From 74204f8fa117e7d0fa1d1a7a57c3c97df83ad418 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 13 Dec 2016 09:38:25 +0100
Subject: rfkill: simplify rfkill_set_hw_state() slightly

Simplify the two conditions gating the schedule_work() into
a single one and get rid of the additional exit point from
the function in doing so.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..184bb711a06d 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -478,10 +478,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 
 	rfkill_led_trigger_event(rfkill);
 
-	if (!rfkill->registered)
-		return ret;
-
-	if (prev != blocked)
+	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	return ret;
-- 
cgit v1.2.3


From 543b921b475ad2e1897d5e7784437af238b33705 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Thu, 17 Nov 2016 12:48:53 +0000
Subject: cfg80211: get rid of name indirection trick for
 ieee80211_get_channel()

The comment on the name indirection suggested an issue but turned out
to be untrue. Digging in older kernel version showed issue with ipw2x00
but that is no longer true so get rid on the name indirection.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/htt_rx.c        |  3 +--
 drivers/net/wireless/marvell/mwifiex/cfg80211.c |  2 +-
 include/net/cfg80211.h                          | 17 +++--------------
 net/wireless/util.c                             |  5 ++---
 4 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c
index 86d082cf4eef..0bc7fe8c4b52 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -2451,8 +2451,7 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb)
 		u32 phymode = __le32_to_cpu(resp->chan_change.phymode);
 		u32 freq = __le32_to_cpu(resp->chan_change.freq);
 
-		ar->tgt_oper_chan =
-			__ieee80211_get_channel(ar->hw->wiphy, freq);
+		ar->tgt_oper_chan = ieee80211_get_channel(ar->hw->wiphy, freq);
 		ath10k_dbg(ar, ATH10K_DBG_HTT,
 			   "htt chan change freq %u phymode %s\n",
 			   freq, ath10k_wmi_phymode_str(phymode));
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 145cc4b5103b..1e3bd435a694 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -2078,7 +2078,7 @@ static int mwifiex_cfg80211_inform_ibss_bss(struct mwifiex_private *priv)
 	ie_len = ie_buf[1] + sizeof(struct ieee_types_header);
 
 	band = mwifiex_band_to_radio_type(priv->curr_bss_params.band);
-	chan = __ieee80211_get_channel(priv->wdev.wiphy,
+	chan = ieee80211_get_channel(priv->wdev.wiphy,
 			ieee80211_channel_to_frequency(bss_info.bss_chan,
 						       band));
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 814be4b4200c..ca2ac1ce5862 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3955,26 +3955,15 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band);
  */
 int ieee80211_frequency_to_channel(int freq);
 
-/*
- * Name indirection necessary because the ieee80211 code also has
- * a function named "ieee80211_get_channel", so if you include
- * cfg80211's header file you get cfg80211's version, if you try
- * to include both header files you'll (rightfully!) get a symbol
- * clash.
- */
-struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
-						  int freq);
 /**
  * ieee80211_get_channel - get channel struct from wiphy for specified frequency
+ *
  * @wiphy: the struct wiphy to get the channel for
  * @freq: the center frequency of the channel
+ *
  * Return: The channel struct from @wiphy at @freq.
  */
-static inline struct ieee80211_channel *
-ieee80211_get_channel(struct wiphy *wiphy, int freq)
-{
-	return __ieee80211_get_channel(wiphy, freq);
-}
+struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq);
 
 /**
  * ieee80211_get_response_rate - get basic rate for a given rate
diff --git a/net/wireless/util.c b/net/wireless/util.c
index e9d040d29846..2cf7df8173b6 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -114,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq)
 }
 EXPORT_SYMBOL(ieee80211_frequency_to_channel);
 
-struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
-						  int freq)
+struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
 {
 	enum nl80211_band band;
 	struct ieee80211_supported_band *sband;
@@ -135,7 +134,7 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
 
 	return NULL;
 }
-EXPORT_SYMBOL(__ieee80211_get_channel);
+EXPORT_SYMBOL(ieee80211_get_channel);
 
 static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
 				     enum nl80211_band band)
-- 
cgit v1.2.3


From 5a88de5342f3090d8292132a392094034d85d101 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Thu, 17 Nov 2016 09:02:40 +0000
Subject: nl80211: check NL80211_ATTR_SCHED_SCAN_INTERVAL only once

The presence of the NL80211_ATTR_SCHED_SCAN_INTERVAL attribute was
checked in nl80211_parse_sched_scan() and
nl80211_parse_sched_scan_plans() which might be a bit redundant
so removing one.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3df85a751a85..db1a434f6169 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6775,13 +6775,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
 
 		/*
 		 * If scan plans are not specified,
-		 * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this
+		 * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this
 		 * case one scan plan will be set with the specified scan
 		 * interval and infinite number of iterations.
 		 */
-		if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
-			return -EINVAL;
-
 		interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
 		if (!interval)
 			return -EINVAL;
-- 
cgit v1.2.3


From b528414ca3b7b57f4f81b7ff207e6f8ffde89d2e Mon Sep 17 00:00:00 2001
From: Michael Braun
Date: Mon, 31 Oct 2016 14:40:59 +0100
Subject: nl80211: multicast_to_unicast can be changed while IFF_UP

There is no need to prevent toggling multicast_to_unicast while
interface is already up. This change simplifies reconfiguration
from hostapd.

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index db1a434f6169..7762231abd32 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11804,9 +11804,6 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
 	const struct nlattr *nla;
 	bool enabled;
 
-	if (netif_running(dev))
-		return -EBUSY;
-
 	if (!rdev->ops->set_multicast_to_unicast)
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3


From 99cd25810e326c62f5a51fa6f0dd1e848060680f Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Mon, 21 Nov 2016 21:44:24 -0800
Subject: mac80211: Remove unused 'struct ieee80211_rx_status' ptr

Commit 554891e63a29 introduced 'struct ieee80211_rx_status' in
ieee80211_rx_h_defragment but did not use it. Compiling with W=1
gives the following warning, fix it.

net/mac80211/rx.c: In function ‘ieee80211_rx_h_defragment’:
net/mac80211/rx.c:1911:30: warning: variable ‘status’ set but not used [-Wunused-but-set-variable]

Fixes: 554891e63a29 ("mac80211: move packet flags into packet")
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Kirtika Ruchandani <kirtika@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index eeab7250f4b9..d2a00f2a6efe 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1908,7 +1908,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	unsigned int frag, seq;
 	struct ieee80211_fragment_entry *entry;
 	struct sk_buff *skb;
-	struct ieee80211_rx_status *status;
 
 	hdr = (struct ieee80211_hdr *)rx->skb->data;
 	fc = hdr->frame_control;
@@ -2034,9 +2033,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		dev_kfree_skb(skb);
 	}
 
-	/* Complete frame has been reassembled - process it now */
-	status = IEEE80211_SKB_RXCB(rx->skb);
-
  out:
 	ieee80211_led_rx(rx->local);
  out_no_led:
-- 
cgit v1.2.3


From 872684b1f7de40b95dac51245f3aad208528a6c3 Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Mon, 21 Nov 2016 22:04:46 -0800
Subject: mac80211: Remove unused 'rates_idx' variable

Commit f027c2aca0cf introduced 'rates_idx' in
ieee80211_tx_status_noskb but did not use it. Compiling with W=1
gives the following warning, fix it.

mac80211/status.c: In function ‘ieee80211_tx_status_noskb’:
mac80211/status.c:636:6: warning: variable ‘rates_idx’ set but not used [-Wunused-but-set-variable]

This is a harmless warning, and is only being fixed to reduce the
noise generated with W=1.

Fixes: f027c2aca0cf ("mac80211: add ieee80211_tx_status_noskb")
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Kirtika Ruchandani <kirtika@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/status.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..f7c5ae597639 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -633,10 +633,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_supported_band *sband;
 	int retry_count;
-	int rates_idx;
 	bool acked, noack_success;
 
-	rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+	ieee80211_tx_get_rates(hw, info, &retry_count);
 
 	sband = hw->wiphy->bands[info->band];
 
-- 
cgit v1.2.3


From fb803becb1180d3f940a634f073981e0b72d7030 Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Mon, 21 Nov 2016 22:54:16 -0800
Subject: mac80211: Remove unused 'struct rate_control_ref' variable

Commit 3b17fbf87d5d introduced sta_get_expected_throughput()
leaving variable 'struct rate_control_ref* ref' set but unused.
Compiling with W=1 gives the following warning, fix it.

net/mac80211/sta_info.c: In function ‘sta_set_sinfo’:
net/mac80211/sta_info.c:2052:27: warning: variable ‘ref’ set but not used [-Wunused-but-set-variable]

Fixes: 3b17fbf87d5d ("mac80211: mesh: Add support for HW RC implementation")
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Maxim Altshul <maxim.altshul@ti.com>
Signed-off-by: Kirtika Ruchandani <kirtika@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 1711bae4abf2..4ab75a9d70c7 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2049,16 +2049,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
-	struct rate_control_ref *ref = NULL;
 	u32 thr = 0;
 	int i, ac, cpu;
 	struct ieee80211_sta_rx_stats *last_rxstats;
 
 	last_rxstats = sta_get_last_rx_stats(sta);
 
-	if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
-		ref = local->rate_ctrl;
-
 	sinfo->generation = sdata->local->sta_generation;
 
 	/* do before driver, so beacon filtering drivers have a
-- 
cgit v1.2.3


From b7f2405c6bd8ae0125a974308f649637cdc81f80 Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Wed, 23 Nov 2016 20:45:36 -0800
Subject: mac80211: Remove unused 'i' variable

Commit 5bcae31d9 (mac80211: implement multi-vif in-place reservations)
introduced ieee80211_vif_use_reserved_switch() with a counter variable
'i' that is set but not used. Compiling with W=1 gives the following
warning, fix it.
net/mac80211/chan.c: In function ‘ieee80211_vif_use_reserved_switch’:
net/mac80211/chan.c:1273:6: warning: variable ‘i’ set but not used [-Wunused-but-set-variable]

This is a harmless warning, and is only being fixed to reduce the
noise obtained with W=1 in the kernel.

Fixes: 5bcae31d9 ("mac80211: implement multi-vif in-place reservations")
Cc: Michal Kazior <michal.kazior@tieto.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Kirtika Ruchandani <kirtika@chromium.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/chan.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index e75cbf6ecc26..7550fd264286 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1270,7 +1270,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	struct ieee80211_sub_if_data *sdata, *sdata_tmp;
 	struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
 	struct ieee80211_chanctx *new_ctx = NULL;
-	int i, err, n_assigned, n_reserved, n_ready;
+	int err, n_assigned, n_reserved, n_ready;
 	int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
 
 	lockdep_assert_held(&local->mtx);
@@ -1391,8 +1391,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	 * Update all structures, values and pointers to point to new channel
 	 * context(s).
 	 */
-
-	i = 0;
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
-- 
cgit v1.2.3


From cd5861bde0299fd5189a6b9e0d1e31e205064fc6 Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Wed, 23 Nov 2016 20:45:49 -0800
Subject: mac80211: Remove unused 'len' variable

Commit 633e27132625 (mac80211: split sched scan IEs) introduced the
len variable to keep track of the return value of
ieee80211_build_preq_ies() but did not use it. Compiling with W=1
gives the following warning, fix it.

net/mac80211/scan.c: In function ‘__ieee80211_request_sched_scan_start’:
net/mac80211/scan.c:1123:9: warning: variable ‘len’ set but not used [-Wunused-but-set-variable]

This is a harmless warning and is only being fixed to reduce the noise
with W=1 in the kernel.

Fixes: 633e27132625 ("mac80211: split sched scan IEs")
Cc: David Spinadel <david.spinadel@intel.com>
Cc: Alexander Bondar <alexander.bondar@intel.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Kirtika Ruchandani <kirtika@chromium.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/scan.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 23d8ac829279..faab3c490d2b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 	u32 rate_masks[NUM_NL80211_BANDS] = {};
 	u8 bands_used = 0;
 	u8 *ie;
-	size_t len;
 
 	iebufsz = local->scan_ies_len + req->ie_len;
 
@@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
 
-	len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
-				       &sched_scan_ies, req->ie,
-				       req->ie_len, bands_used,
-				       rate_masks, &chandef);
+	ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
+				 &sched_scan_ies, req->ie,
+				 req->ie_len, bands_used, rate_masks, &chandef);
 
 	ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
 	if (ret == 0) {
-- 
cgit v1.2.3


From 148f284b23061b6de4024423a6606437cc427f1c Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Wed, 23 Nov 2016 20:46:14 -0800
Subject: mac80211: Remove unused 'sband' and 'local' variables

Commit b1bce14a7954 (mac80211: update opmode when adding new station)
refactored ieee80211_vht_handle_opmode into __ieee80211_vht_handle_opmode
and ieee80211_vht_handle_opmode leaving a set but unused variable
(sband) in the former. Compiling with W=1 gives the following warning,
fix it.

net/mac80211/vht.c: In function ‘__ieee80211_vht_handle_opmode’:
net/mac80211/vht.c:424:35: warning: variable ‘sband’ set but not used [-Wunused-but-set-variable]

Remove 'struct ieee80211_local* local' as well, it was only used to
set sband.

This is a harmless warning, and is only being fixed to reduce the
noise with W=1 in the kernel.

Fixes: b1bce14a7954 ("mac80211: update opmode when adding new station")
Cc: Marek Kwaczynski <marek.kwaczynski@tieto.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Kirtika Ruchandani <kirtika@chromium.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/vht.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 6832bf6ab69f..d5a56e039106 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				  struct sta_info *sta, u8 opmode,
 				  enum nl80211_band band)
 {
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_supported_band *sband;
 	enum ieee80211_sta_rx_bandwidth new_bw;
 	u32 changed = 0;
 	u8 nss;
 
-	sband = local->hw.wiphy->bands[band];
-
 	/* ignore - no support for BF yet */
 	if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
 		return 0;
-- 
cgit v1.2.3


From ab7257251cc63eef5fa6702c3c616c6e2524d777 Mon Sep 17 00:00:00 2001
From: Kirtika Ruchandani
Date: Wed, 23 Nov 2016 20:46:27 -0800
Subject: mac80211: Remove unused 'beaconint_us' variable

Commit 4a733ef1bea7 (mac80211: remove PM-QoS listener) removed all use
of 'beaconint_us' from ieee80211_recalc_ps() but left the variable
intact. Compiling with W=1 gives the following warning, fix it.
net/mac80211/mlme.c: In function ‘ieee80211_recalc_ps’:
net/mac80211/mlme.c:1481:7: warning: variable ‘beaconint_us’ set but not used [-Wunused-but-set-variable]

iee80211_tu_to_usec has no side-effects and is safe to remove.

Fixes: 4a733ef1bea7 ("mac80211: remove PM-QoS listener")
Cc: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Kirtika Ruchandani <kirtika@chromium.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 098ce9b179ee..8a6344518674 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1486,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local)
 
 	if (count == 1 && ieee80211_powersave_allowed(found)) {
 		u8 dtimper = found->u.mgd.dtim_period;
-		s32 beaconint_us;
-
-		beaconint_us = ieee80211_tu_to_usec(
-					found->vif.bss_conf.beacon_int);
 
 		timeout = local->dynamic_ps_forced_timeout;
 		if (timeout < 0)
-- 
cgit v1.2.3


From ebceec860fc370b2f4c23e95c51daa932e047913 Mon Sep 17 00:00:00 2001
From: Michael Braun
Date: Tue, 22 Nov 2016 11:52:18 +0100
Subject: mac80211: multicast to unicast conversion

Add the ability for an AP (and associated VLANs) to perform
multicast-to-unicast conversion for ARP, IPv4 and IPv6 frames
(possibly within 802.1Q). If enabled, such frames are to be sent
to each station separately, with the DA replaced by their own
MAC address rather than the group address.

Note that this may break certain expectations of the receiver,
such as the ability to drop unicast IP packets received within
multicast L2 frames, or the ability to not send ICMP destination
unreachable messages for packets received in L2 multicast (which
is required, but the receiver can't tell the difference if this
new option is enabled.)

This also doesn't implement the 802.11 DMS (directed multicast
service).

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
[use true/false, rename label to the correct "multicast",
 use __be16 for ethertype and network order for constants]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c            |  12 +++++
 net/mac80211/debugfs_netdev.c |   3 ++
 net/mac80211/ieee80211_i.h    |   1 +
 net/mac80211/tx.c             | 122 +++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e91e503bf992..a0be2f6cd121 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3563,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL(ieee80211_nan_func_match);
 
+static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
+					      struct net_device *dev,
+					      const bool enabled)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	sdata->u.ap.multicast_to_unicast = enabled;
+
+	return 0;
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -3653,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.nan_change_conf = ieee80211_nan_change_conf,
 	.add_nan_func = ieee80211_add_nan_func,
 	.del_nan_func = ieee80211_del_nan_func,
+	.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
 };
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 1a05f85cb1f0..8f5fff8b2040 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -519,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm(
 }
 IEEE80211_IF_FILE_R(aqm);
 
+IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
+
 /* IBSS attributes */
 static ssize_t ieee80211_if_fmt_tsf(
 	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -683,6 +685,7 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_ADD(dtim_count);
 	DEBUGFS_ADD(num_buffered_multicast);
 	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+	DEBUGFS_ADD_MODE(multicast_to_unicast, 0600);
 }
 
 static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d37a577f63a1..a3af14e3e64d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -297,6 +297,7 @@ struct ieee80211_if_ap {
 			 driver_smps_mode; /* smps mode request */
 
 	struct work_struct request_smps_work;
+	bool multicast_to_unicast;
 };
 
 struct ieee80211_if_wds {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 2c21b7039136..a39c0f07021b 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/skbuff.h>
+#include <linux/if_vlan.h>
 #include <linux/etherdevice.h>
 #include <linux/bitmap.h>
 #include <linux/rcupdate.h>
@@ -3573,6 +3574,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	rcu_read_unlock();
 }
 
+static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
+{
+	struct ethhdr *eth;
+	int err;
+
+	err = skb_ensure_writable(skb, ETH_HLEN);
+	if (unlikely(err))
+		return err;
+
+	eth = (void *)skb->data;
+	ether_addr_copy(eth->h_dest, sta->sta.addr);
+
+	return 0;
+}
+
+static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	const struct ethhdr *eth = (void *)skb->data;
+	const struct vlan_ethhdr *ethvlan = (void *)skb->data;
+	__be16 ethertype;
+
+	if (likely(!is_multicast_ether_addr(eth->h_dest)))
+		return false;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		if (sdata->u.vlan.sta)
+			return false;
+		if (sdata->wdev.use_4addr)
+			return false;
+		/* fall through */
+	case NL80211_IFTYPE_AP:
+		/* check runtime toggle for this bss */
+		if (!sdata->bss->multicast_to_unicast)
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* multicast to unicast conversion only for some payload */
+	ethertype = eth->h_proto;
+	if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
+		ethertype = ethvlan->h_vlan_encapsulated_proto;
+	switch (ethertype) {
+	case htons(ETH_P_ARP):
+	case htons(ETH_P_IP):
+	case htons(ETH_P_IPV6):
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static void
+ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
+			     struct sk_buff_head *queue)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	const struct ethhdr *eth = (struct ethhdr *)skb->data;
+	struct sta_info *sta, *first = NULL;
+	struct sk_buff *cloned_skb;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sdata != sta->sdata)
+			/* AP-VLAN mismatch */
+			continue;
+		if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
+			/* do not send back to source */
+			continue;
+		if (!first) {
+			first = sta;
+			continue;
+		}
+		cloned_skb = skb_clone(skb, GFP_ATOMIC);
+		if (!cloned_skb)
+			goto multicast;
+		if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
+			dev_kfree_skb(cloned_skb);
+			goto multicast;
+		}
+		__skb_queue_tail(queue, cloned_skb);
+	}
+
+	if (likely(first)) {
+		if (unlikely(ieee80211_change_da(skb, first)))
+			goto multicast;
+		__skb_queue_tail(queue, skb);
+	} else {
+		/* no STA connected, drop */
+		kfree_skb(skb);
+		skb = NULL;
+	}
+
+	goto out;
+multicast:
+	__skb_queue_purge(queue);
+	__skb_queue_tail(queue, skb);
+out:
+	rcu_read_unlock();
+}
+
 /**
  * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
  * @skb: packet to be sent
@@ -3583,7 +3693,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	__ieee80211_subif_start_xmit(skb, dev, 0);
+	if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
+		struct sk_buff_head queue;
+
+		__skb_queue_head_init(&queue);
+		ieee80211_convert_to_unicast(skb, dev, &queue);
+		while ((skb = __skb_dequeue(&queue)))
+			__ieee80211_subif_start_xmit(skb, dev, 0);
+	} else {
+		__ieee80211_subif_start_xmit(skb, dev, 0);
+	}
+
 	return NETDEV_TX_OK;
 }
 
-- 
cgit v1.2.3


From 11197d006bcfabf0173a7820a163fcaac420d10e Mon Sep 17 00:00:00 2001
From: Masashi Honma
Date: Wed, 30 Nov 2016 09:06:04 +0900
Subject: mac80211: Suppress NEW_PEER_CANDIDATE event if no room

Previously, kernel sends NEW_PEER_CANDIDATE event to user land even if
the found peer does not have any room to accept other peer. This causes
continuous connection trials.

Signed-off-by: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_plink.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7fcdcf622655..fcba70e57073 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -505,12 +505,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
 
 	/* Userspace handles station allocation */
 	if (sdata->u.mesh.user_mpm ||
-	    sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED)
-		cfg80211_notify_new_peer_candidate(sdata->dev, addr,
-						   elems->ie_start,
-						   elems->total_len,
-						   GFP_KERNEL);
-	else
+	    sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
+		if (mesh_peer_accepts_plinks(elems) &&
+		    mesh_plink_availables(sdata))
+			cfg80211_notify_new_peer_candidate(sdata->dev, addr,
+							   elems->ie_start,
+							   elems->total_len,
+							   GFP_KERNEL);
+	} else
 		sta = __mesh_sta_info_alloc(sdata, addr);
 
 	return sta;
-- 
cgit v1.2.3


From 4a5eccaa93505cd186939a01d7c115568b1881a6 Mon Sep 17 00:00:00 2001
From: Ben Greear
Date: Mon, 5 Dec 2016 10:58:30 -0800
Subject: mac80211: Show pending txqlen in debugfs.

Could be useful for debugging memory consumption issues,
and perhaps power-save as well.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e02ba42ca827..f62cd0e13c58 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -243,6 +243,31 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
 	return rv;
 }
 
+static ssize_t misc_read(struct file *file, char __user *user_buf,
+			 size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	/* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
+	size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
+	char *buf = kzalloc(bufsz, GFP_KERNEL);
+	char *pos = buf, *end = buf + bufsz - 1;
+	ssize_t rv;
+	int i;
+	int ln;
+
+	pos += scnprintf(pos, end - pos, "pending:\n");
+
+	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+		ln = skb_queue_len(&local->pending[i]);
+		pos += scnprintf(pos, end - pos, "[%i] %d\n",
+				 i, ln);
+	}
+
+	rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+	kfree(buf);
+	return rv;
+}
+
 static ssize_t queues_read(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
@@ -263,6 +288,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
 
 DEBUGFS_READONLY_FILE_OPS(hwflags);
 DEBUGFS_READONLY_FILE_OPS(queues);
+DEBUGFS_READONLY_FILE_OPS(misc);
 
 /* statistics stuff */
 
@@ -331,6 +357,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	DEBUGFS_ADD(total_ps_buffered);
 	DEBUGFS_ADD(wep_iv);
 	DEBUGFS_ADD(queues);
+	DEBUGFS_ADD(misc);
 #ifdef CONFIG_PM
 	DEBUGFS_ADD_MODE(reset, 0200);
 #endif
-- 
cgit v1.2.3


From 6124c53edeeaac4394755ebb38c522bc4eef1460 Mon Sep 17 00:00:00 2001
From: Michał Kępień
Date: Thu, 8 Dec 2016 08:30:51 +0100
Subject: rfkill: Cleanup error handling in rfkill_init()

Use a separate label per error condition in rfkill_init() to make it a
bit cleaner and easier to extend.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 184bb711a06d..9eb26a4cdac8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1263,24 +1263,25 @@ static int __init rfkill_init(void)
 
 	error = class_register(&rfkill_class);
 	if (error)
-		goto out;
+		goto error_class;
 
 	error = misc_register(&rfkill_miscdev);
-	if (error) {
-		class_unregister(&rfkill_class);
-		goto out;
-	}
+	if (error)
+		goto error_misc;
 
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
-	if (error) {
-		misc_deregister(&rfkill_miscdev);
-		class_unregister(&rfkill_class);
-		goto out;
-	}
+	if (error)
+		goto error_input;
 #endif
 
- out:
+	return 0;
+
+error_input:
+	misc_deregister(&rfkill_miscdev);
+error_misc:
+	class_unregister(&rfkill_class);
+error_class:
 	return error;
 }
 subsys_initcall(rfkill_init);
-- 
cgit v1.2.3


From 73f4f76a196d7adb11a1e192bd8024fe0bc83910 Mon Sep 17 00:00:00 2001
From: Michał Kępień
Date: Thu, 8 Dec 2016 08:30:52 +0100
Subject: rfkill: Add rfkill-any LED trigger

Add a new "global" (i.e. not per-rfkill device) LED trigger, rfkill-any,
which may be useful on laptops with a single "radio LED" and multiple
radio transmitters.  The trigger is meant to turn a LED on whenever
there is at least one radio transmitter active and turn it off
otherwise.

This requires taking rfkill_global_mutex before calling rfkill_set_block()
in rfkill_resume(): since __rfkill_any_led_trigger_event() is called from
rfkill_set_block() unconditionally, each caller of the latter needs to
take care of locking rfkill_global_mutex.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 9eb26a4cdac8..f6b01f3616e5 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,47 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 	led_trigger_unregister(&rfkill->led_trigger);
 }
+
+static struct led_trigger rfkill_any_led_trigger;
+
+static void __rfkill_any_led_trigger_event(void)
+{
+	enum led_brightness brightness = LED_OFF;
+	struct rfkill *rfkill;
+
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+			brightness = LED_FULL;
+			break;
+		}
+	}
+
+	led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+	mutex_lock(&rfkill_global_mutex);
+	__rfkill_any_led_trigger_event();
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+	rfkill_any_led_trigger_event();
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	rfkill_any_led_trigger.name = "rfkill-any";
+	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+	led_trigger_unregister(&rfkill_any_led_trigger);
+}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -189,6 +230,23 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
+
+static void __rfkill_any_led_trigger_event(void)
+{
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +355,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	__rfkill_any_led_trigger_event();
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -477,6 +536,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
@@ -520,6 +580,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	return blocked;
 }
@@ -569,6 +630,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
+		rfkill_any_led_trigger_event();
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -812,8 +874,10 @@ static int rfkill_resume(struct device *dev)
 	rfkill->suspended = false;
 
 	if (!rfkill->persistent) {
+		mutex_lock(&rfkill_global_mutex);
 		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
 		rfkill_set_block(rfkill, cur);
+		mutex_unlock(&rfkill_global_mutex);
 	}
 
 	if (rfkill->ops->poll && !rfkill->polling_paused)
@@ -985,6 +1049,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
+	__rfkill_any_led_trigger_event();
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1017,6 +1082,7 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
+	__rfkill_any_led_trigger_event();
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1269,6 +1335,10 @@ static int __init rfkill_init(void)
 	if (error)
 		goto error_misc;
 
+	error = rfkill_any_led_trigger_register();
+	if (error)
+		goto error_led_trigger;
+
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
 	if (error)
@@ -1278,6 +1348,8 @@ static int __init rfkill_init(void)
 	return 0;
 
 error_input:
+	rfkill_any_led_trigger_unregister();
+error_led_trigger:
 	misc_deregister(&rfkill_miscdev);
 error_misc:
 	class_unregister(&rfkill_class);
@@ -1291,6 +1363,7 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
+	rfkill_any_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
-- 
cgit v1.2.3


From 76f43b4c0a9337af22827d78de4f2b8fd5328489 Mon Sep 17 00:00:00 2001
From: Masashi Honma
Date: Thu, 8 Dec 2016 10:15:50 +0900
Subject: mac80211: Remove invalid flag operations in mesh TSF synchronization

mesh_sync_offset_adjust_tbtt() implements Extensible synchronization
framework ([1] 13.13.2 Extensible synchronization framework). It shall
not operate the flag "TBTT Adjusting subfield" ([1] 8.4.2.100.8 Mesh
Capability), since it is used only for MBCA ([1] 13.13.4 Mesh beacon
collision avoidance, see 13.13.4.4.3 TBTT scanning and adjustment
procedures for detail). So this patch remove the flag operations.

[1] IEEE Std 802.11 2012

Signed-off-by: Masashi Honma <masashi.honma@gmail.com>
[remove adjusting_tbtt entirely, since it's now unused]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h |  1 -
 net/mac80211/mesh.c        |  3 ---
 net/mac80211/mesh_sync.c   | 11 -----------
 3 files changed, 15 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a3af14e3e64d..03e3230479f3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -689,7 +689,6 @@ struct ieee80211_if_mesh {
 	const struct ieee80211_mesh_sync_ops *sync_ops;
 	s64 sync_offset_clockdrift_max;
 	spinlock_t sync_offset_lock;
-	bool adjusting_tbtt;
 	/* mesh power save */
 	enum nl80211_mesh_power_mode nonpeer_pm;
 	int ps_peers_light_sleep;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..b9628374dcbd 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,8 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
 	/* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
 	*pos |= ifmsh->ps_peers_deep_sleep ?
 			IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
-	*pos++ |= ifmsh->adjusting_tbtt ?
-			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
 	*pos++ = 0x00;
 
 	return 0;
@@ -850,7 +848,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 	ifmsh->mesh_cc_id = 0;	/* Disabled */
 	/* register sync ops from extensible synchronization framework */
 	ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
-	ifmsh->adjusting_tbtt = false;
 	ifmsh->sync_offset_clockdrift_max = 0;
 	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
 	ieee80211_mesh_root_setup(ifmsh);
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index faca22cd02b5..75608c07dc7b 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 	 */
 
 	if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
-		clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
 		msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
 			  sta->sta.addr);
 		goto no_sync;
@@ -172,11 +171,9 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
 					 struct beacon_data *beacon)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	u8 cap;
 
 	WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
 	WARN_ON(!rcu_read_lock_held());
-	cap = beacon->meshconf->meshconf_cap;
 
 	spin_lock_bh(&ifmsh->sync_offset_lock);
 
@@ -190,21 +187,13 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
 			  "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n",
 			  ifmsh->sync_offset_clockdrift_max);
 		set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
-
-		ifmsh->adjusting_tbtt = true;
 	} else {
 		msync_dbg(sdata,
 			  "TBTT : max clockdrift=%lld; too small to adjust\n",
 			  (long long)ifmsh->sync_offset_clockdrift_max);
 		ifmsh->sync_offset_clockdrift_max = 0;
-
-		ifmsh->adjusting_tbtt = false;
 	}
 	spin_unlock_bh(&ifmsh->sync_offset_lock);
-
-	beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
-			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
-			~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
 }
 
 static const struct sync_method sync_methods[] = {
-- 
cgit v1.2.3


From 445cd452fe5187e676eef02c917c9e5f837c749e Mon Sep 17 00:00:00 2001
From: Masashi Honma
Date: Thu, 8 Dec 2016 10:15:51 +0900
Subject: mac80211: Use appropriate name for functions and messages

These functions drifts TSF timers, not TBTT.

Signed-off-by: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h |  4 ++--
 net/mac80211/mesh.c        |  2 +-
 net/mac80211/mesh.h        |  2 +-
 net/mac80211/mesh_sync.c   | 16 ++++++++--------
 net/mac80211/tx.c          |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 03e3230479f3..e5d73118d08f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -625,8 +625,8 @@ struct ieee80211_mesh_sync_ops {
 			     struct ieee80211_rx_status *rx_status);
 
 	/* should be called with beacon_data under RCU read lock */
-	void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata,
-			    struct beacon_data *beacon);
+	void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
+			   struct beacon_data *beacon);
 	/* add other framework functions here */
 };
 
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index b9628374dcbd..cc2a63bd233f 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1346,7 +1346,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
 		ieee80211_mesh_rootpath(sdata);
 
 	if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
-		mesh_sync_adjust_tbtt(sdata);
+		mesh_sync_adjust_tsf(sdata);
 
 	if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
 		mesh_bss_info_changed(sdata);
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 26b9ccbe1fce..7e5f271e3c30 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
 }
 
 void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata);
 void ieee80211s_stop(void);
 #else
 static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index 75608c07dc7b..a435f094a82e 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -12,7 +12,7 @@
 #include "mesh.h"
 #include "driver-ops.h"
 
-/* This is not in the standard.  It represents a tolerable tbtt drift below
+/* This is not in the standard.  It represents a tolerable tsf drift below
  * which we do no TSF adjustment.
  */
 #define TOFFSET_MINIMUM_ADJUSTMENT 10
@@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
 			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
 }
 
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
 
 	spin_lock_bh(&ifmsh->sync_offset_lock);
 	if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
-		msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n",
+		msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n",
 			  (long long) ifmsh->sync_offset_clockdrift_max);
 		tsfdelta = -ifmsh->sync_offset_clockdrift_max;
 		ifmsh->sync_offset_clockdrift_max = 0;
 	} else {
-		msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n",
+		msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n",
 			  (long long) ifmsh->sync_offset_clockdrift_max,
 			  (unsigned long long) beacon_int_fraction);
 		tsfdelta = -beacon_int_fraction;
@@ -167,7 +167,7 @@ no_sync:
 	rcu_read_unlock();
 }
 
-static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
+static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata,
 					 struct beacon_data *beacon)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -184,12 +184,12 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
 		 * the tsf adjustment to the mesh tasklet
 		 */
 		msync_dbg(sdata,
-			  "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n",
+			  "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n",
 			  ifmsh->sync_offset_clockdrift_max);
 		set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
 	} else {
 		msync_dbg(sdata,
-			  "TBTT : max clockdrift=%lld; too small to adjust\n",
+			  "TSF : max clockdrift=%lld; too small to adjust\n",
 			  (long long)ifmsh->sync_offset_clockdrift_max);
 		ifmsh->sync_offset_clockdrift_max = 0;
 	}
@@ -201,7 +201,7 @@ static const struct sync_method sync_methods[] = {
 		.method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
 		.ops = {
 			.rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
-			.adjust_tbtt = &mesh_sync_offset_adjust_tbtt,
+			.adjust_tsf = &mesh_sync_offset_adjust_tsf,
 		}
 	},
 };
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a39c0f07021b..058652d000d9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4196,7 +4196,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		}
 
 		if (ifmsh->sync_ops)
-			ifmsh->sync_ops->adjust_tbtt(sdata, beacon);
+			ifmsh->sync_ops->adjust_tsf(sdata, beacon);
 
 		skb = dev_alloc_skb(local->tx_headroom +
 				    beacon->head_len +
-- 
cgit v1.2.3


From 0c2e384267b815fb784c415a90de7bdd78da0b66 Mon Sep 17 00:00:00 2001
From: Koen Vandeputte
Date: Wed, 14 Dec 2016 17:28:59 +0100
Subject: mac80211: only alloc mem if a station doesn't exist yet

This speeds up the function in case a station already exists by avoiding
calling an expensive kzalloc just to free it again after the next check.

Signed-off-by: Koen Vandeputte <koen.vandeputte@ncentric.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 4ab75a9d70c7..f5a24b742eda 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 {
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	struct station_info *sinfo;
+	struct station_info *sinfo = NULL;
 	int err = 0;
 
 	lockdep_assert_held(&local->sta_mtx);
 
-	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
-	if (!sinfo) {
-		err = -ENOMEM;
-		goto out_err;
-	}
-
 	/* check if STA exists already */
 	if (sta_info_get_bss(sdata, sta->sta.addr)) {
 		err = -EEXIST;
 		goto out_err;
 	}
 
+	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
+	if (!sinfo) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
 	local->num_sta++;
 	local->sta_generation++;
 	smp_mb();
-- 
cgit v1.2.3


From 41d085835d3d2258633d8959e717198e31d3862e Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:54 +0100
Subject: mac80211: minstrel_ht: move supported bitrate mask out of group data

Improves dcache footprint by ensuring that fewer cache lines need to be
touched.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c         | 30 +++++++++++++++---------------
 net/mac80211/rc80211_minstrel_ht.h         |  6 +++---
 net/mac80211/rc80211_minstrel_ht_debugfs.c |  8 ++++----
 3 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 30fbabf4bcbc..ed8a34d46e73 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -301,7 +301,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
 				break;
 
 		/* short preamble */
-		if (!(mi->groups[group].supported & BIT(idx)))
+		if (!(mi->supported[group] & BIT(idx)))
 			idx += 4;
 	}
 	return &mi->groups[group].rates[idx];
@@ -486,7 +486,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
 			  MCS_GROUP_RATES].streams;
 	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
 		mg = &mi->groups[group];
-		if (!mg->supported || group == MINSTREL_CCK_GROUP)
+		if (!mi->supported[group] || group == MINSTREL_CCK_GROUP)
 			continue;
 
 		tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
@@ -540,7 +540,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
 
 		mg = &mi->groups[group];
-		if (!mg->supported)
+		if (!mi->supported[group])
 			continue;
 
 		mi->sample_count++;
@@ -550,7 +550,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 			tmp_group_tp_rate[j] = group;
 
 		for (i = 0; i < MCS_GROUP_RATES; i++) {
-			if (!(mg->supported & BIT(i)))
+			if (!(mi->supported[group] & BIT(i)))
 				continue;
 
 			index = MCS_GROUP_RATES * group + i;
@@ -636,7 +636,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
 		mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
 		mg = &mi->groups[mi->sample_group];
 
-		if (!mg->supported)
+		if (!mi->supported[mi->sample_group])
 			continue;
 
 		if (++mg->index >= MCS_GROUP_RATES) {
@@ -657,7 +657,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
 	while (group > 0) {
 		group--;
 
-		if (!mi->groups[group].supported)
+		if (!mi->supported[group])
 			continue;
 
 		if (minstrel_mcs_groups[group].streams >
@@ -994,7 +994,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 	sample_idx = sample_table[mg->column][mg->index];
 	minstrel_set_next_sample_idx(mi);
 
-	if (!(mg->supported & BIT(sample_idx)))
+	if (!(mi->supported[sample_group] & BIT(sample_idx)))
 		return -1;
 
 	mrs = &mg->rates[sample_idx];
@@ -1052,7 +1052,7 @@ static void
 minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
 				    struct minstrel_ht_sta *mi, bool val)
 {
-	u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
+	u8 supported = mi->supported[MINSTREL_CCK_GROUP];
 
 	if (!supported || !mi->cck_supported_short)
 		return;
@@ -1061,7 +1061,7 @@ minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
 		return;
 
 	supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
-	mi->groups[MINSTREL_CCK_GROUP].supported = supported;
+	mi->supported[MINSTREL_CCK_GROUP] = supported;
 }
 
 static void
@@ -1154,7 +1154,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
 			mi->cck_supported_short |= BIT(i);
 	}
 
-	mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported;
+	mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported;
 }
 
 static void
@@ -1224,7 +1224,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 		u32 gflags = minstrel_mcs_groups[i].flags;
 		int bw, nss;
 
-		mi->groups[i].supported = 0;
+		mi->supported[i] = 0;
 		if (i == MINSTREL_CCK_GROUP) {
 			minstrel_ht_update_cck(mp, mi, sband, sta);
 			continue;
@@ -1256,8 +1256,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 			if (use_vht && minstrel_vht_only)
 				continue;
 #endif
-			mi->groups[i].supported = mcs->rx_mask[nss - 1];
-			if (mi->groups[i].supported)
+			mi->supported[i] = mcs->rx_mask[nss - 1];
+			if (mi->supported[i])
 				n_supported++;
 			continue;
 		}
@@ -1283,10 +1283,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 		else
 			bw = BW_20;
 
-		mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss,
+		mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss,
 				vht_cap->vht_mcs.tx_mcs_map);
 
-		if (mi->groups[i].supported)
+		if (mi->supported[i])
 			n_supported++;
 	}
 
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index e8b52a94d24b..de1646c42e82 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -52,9 +52,6 @@ struct minstrel_mcs_group_data {
 	u8 index;
 	u8 column;
 
-	/* bitfield of supported MCS rates of this group */
-	u16 supported;
-
 	/* sorted rate set within a MCS group*/
 	u16 max_group_tp_rate[MAX_THR_RATES];
 	u16 max_group_prob_rate;
@@ -101,6 +98,9 @@ struct minstrel_ht_sta {
 	u8 cck_supported;
 	u8 cck_supported_short;
 
+	/* Bitfield of supported MCS rates of all groups */
+	u16 supported[MINSTREL_GROUPS_NB];
+
 	/* MCS rate group info and statistics */
 	struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
 };
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5320e35ed3d0..1942d4b3cd18 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -24,7 +24,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 	char gimode = 'L';
 	u32 gflags;
 
-	if (!mi->groups[i].supported)
+	if (!mi->supported[i])
 		return p;
 
 	mg = &minstrel_mcs_groups[i];
@@ -42,7 +42,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
 
-		if (!(mi->groups[i].supported & BIT(j)))
+		if (!(mi->supported[i] & BIT(j)))
 			continue;
 
 		if (gflags & IEEE80211_TX_RC_MCS) {
@@ -170,7 +170,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 	char gimode = 'L';
 	u32 gflags;
 
-	if (!mi->groups[i].supported)
+	if (!mi->supported[i])
 		return p;
 
 	mg = &minstrel_mcs_groups[i];
@@ -188,7 +188,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
 
-		if (!(mi->groups[i].supported & BIT(j)))
+		if (!(mi->supported[i] & BIT(j)))
 			continue;
 
 		if (gflags & IEEE80211_TX_RC_MCS) {
-- 
cgit v1.2.3


From 782dda00ab8e9485ddc2a1fb8d8681dcc5136283 Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:55 +0100
Subject: mac80211: minstrel_ht: move short preamble check out of get_rate

Test short preamble support in minstrel_ht_update_caps instead of
looking at the per-packet flag. Makes the code more efficient.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index ed8a34d46e73..ae45dcb9d7a7 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -14,6 +14,7 @@
 #include <linux/ieee80211.h>
 #include <net/mac80211.h>
 #include "rate.h"
+#include "sta_info.h"
 #include "rc80211_minstrel.h"
 #include "rc80211_minstrel_ht.h"
 
@@ -1048,22 +1049,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 	return sample_idx;
 }
 
-static void
-minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
-				    struct minstrel_ht_sta *mi, bool val)
-{
-	u8 supported = mi->supported[MINSTREL_CCK_GROUP];
-
-	if (!supported || !mi->cck_supported_short)
-		return;
-
-	if (supported & (mi->cck_supported_short << (val * 4)))
-		return;
-
-	supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
-	mi->supported[MINSTREL_CCK_GROUP] = supported;
-}
-
 static void
 minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
                      struct ieee80211_tx_rate_control *txrc)
@@ -1087,7 +1072,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
 		minstrel_aggr_check(sta, txrc->skb);
 
 	info->flags |= mi->tx_flags;
-	minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	if (mp->fixed_rate_idx != -1)
@@ -1168,6 +1152,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
 	u16 sta_cap = sta->ht_cap.cap;
 	struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
+	struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
 	int use_vht;
 	int n_supported = 0;
 	int ack_dur;
@@ -1293,6 +1278,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	if (!n_supported)
 		goto use_legacy;
 
+	if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
+		mi->cck_supported_short |= mi->cck_supported_short << 4;
+
 	/* create an initial rate table with the lowest supported rates */
 	minstrel_ht_update_stats(mp, mi);
 	minstrel_ht_update_rates(mp, mi);
-- 
cgit v1.2.3


From 70550df2d354f13ae0544f12aa14b79f094f0de2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:56 +0100
Subject: mac80211: minstrel_ht: make att_hist and succ_hist u32 instead of u64

They are only used for debugging purposes and take a very long time to
overflow. Visibly reduces the size of the per-sta rate control data.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index c230bbe93262..209961cbb7e8 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -59,7 +59,7 @@ struct minstrel_rate_stats {
 	u16 success, last_success;
 
 	/* total attempts/success counters */
-	u64 att_hist, succ_hist;
+	u32 att_hist, succ_hist;
 
 	/* statistis of packet delivery probability
 	 *  cur_prob  - current prob within last update intervall
-- 
cgit v1.2.3


From 95cd470c75a8822d35fa82a1b276d47da37229d1 Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:57 +0100
Subject: mac80211: check for MCS in ieee80211_duration before fetching chanctx

Makes the code a bit more efficient

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 058652d000d9..4dea18be385c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -64,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	u32 rate_flags = 0;
 
+	/* assume HW handles this */
+	if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
+		return 0;
+
 	rcu_read_lock();
 	chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
 	if (chanctx_conf) {
@@ -72,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 	}
 	rcu_read_unlock();
 
-	/* assume HW handles this */
-	if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
-		return 0;
-
 	/* uh huh? */
 	if (WARN_ON_ONCE(tx->rate.idx < 0))
 		return 0;
-- 
cgit v1.2.3


From 1109dc392ea0248f7bfca3537011aa7afaa05cee Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:58 +0100
Subject: mac80211: minstrel: remove cur_prob from debugfs

This field is redundant, because it is simply last success divided by
last attempt count. Removing it from the rate stats struct saves about
1.2 KiB per HT station.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.c            | 10 ++++++----
 net/mac80211/rc80211_minstrel.h            |  2 --
 net/mac80211/rc80211_minstrel_debugfs.c    | 16 ++++++----------
 net/mac80211/rc80211_minstrel_ht_debugfs.c | 16 ++++++----------
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 14c5ba3a1b1c..a284ea08f048 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
 void
 minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
 {
+	unsigned int cur_prob;
+
 	if (unlikely(mrs->attempts > 0)) {
 		mrs->sample_skipped = 0;
-		mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
+		cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
 		if (unlikely(!mrs->att_hist)) {
-			mrs->prob_ewma = mrs->cur_prob;
+			mrs->prob_ewma = cur_prob;
 		} else {
 			/* update exponential weighted moving variance */
 			mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd,
-							 mrs->cur_prob,
+							 cur_prob,
 							 mrs->prob_ewma,
 							 EWMA_LEVEL);
 
 			/*update exponential weighted moving avarage */
 			mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
-						       mrs->cur_prob,
+						       cur_prob,
 						       EWMA_LEVEL);
 		}
 		mrs->att_hist += mrs->attempts;
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index 209961cbb7e8..94b89b1b406c 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -62,10 +62,8 @@ struct minstrel_rate_stats {
 	u32 att_hist, succ_hist;
 
 	/* statistis of packet delivery probability
-	 *  cur_prob  - current prob within last update intervall
 	 *  prob_ewma - exponential weighted moving average of prob
 	 *  prob_ewmsd - exp. weighted moving standard deviation of prob */
-	unsigned int cur_prob;
 	unsigned int prob_ewma;
 	u16 prob_ewmsd;
 
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 820b0abc9c0d..ca0bafd6a44e 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 {
 	struct minstrel_sta_info *mi = inode->i_private;
 	struct minstrel_debugfs_info *ms;
-	unsigned int i, tp_max, tp_avg, prob, eprob;
+	unsigned int i, tp_max, tp_avg, eprob;
 	char *p;
 
 	ms = kmalloc(2048, GFP_KERNEL);
@@ -86,9 +86,9 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 	p = ms->buf;
 	p += sprintf(p, "\n");
 	p += sprintf(p,
-		     "best   __________rate_________    ________statistics________    ________last_______    ______sum-of________\n");
+		     "best   __________rate_________    ________statistics________    ____last_____    ______sum-of________\n");
 	p += sprintf(p,
-		     "rate  [name idx airtime max_tp]  [avg(tp) avg(prob) sd(prob)]  [prob.|retry|suc|att]  [#success | #attempts]\n");
+		     "rate  [name idx airtime max_tp]  [avg(tp) avg(prob) sd(prob)]  [retry|suc|att]  [#success | #attempts]\n");
 
 	for (i = 0; i < mi->n_rates; i++) {
 		struct minstrel_rate *mr = &mi->r[i];
@@ -107,17 +107,15 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
-				"     %3u.%1u %3u   %3u %-3u   "
+				"     %3u   %3u %-3u   "
 				"%9llu   %-9llu\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
 				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -148,7 +146,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 {
 	struct minstrel_sta_info *mi = inode->i_private;
 	struct minstrel_debugfs_info *ms;
-	unsigned int i, tp_max, tp_avg, prob, eprob;
+	unsigned int i, tp_max, tp_avg, eprob;
 	char *p;
 
 	ms = kmalloc(2048, GFP_KERNEL);
@@ -175,16 +173,14 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
 
-		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
+		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
 				"%llu,%llu,%d,%d\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
 				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 1942d4b3cd18..0cecd23bb834 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -19,7 +19,7 @@ static char *
 minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 {
 	const struct mcs_group *mg;
-	unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+	unsigned int j, tp_max, tp_avg, eprob, tx_time;
 	char htmode = '2';
 	char gimode = 'L';
 	u32 gflags;
@@ -83,17 +83,15 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
-				"     %3u.%1u %3u   %3u %-3u   "
+				"     %3u   %3u %-3u   "
 				"%9llu   %-9llu\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
 				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -130,9 +128,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
 
 	p += sprintf(p, "\n");
 	p += sprintf(p,
-		     "              best   ____________rate__________    ________statistics________    ________last_______    ______sum-of________\n");
+		     "              best   ____________rate__________    ________statistics________    _____last____    ______sum-of________\n");
 	p += sprintf(p,
-		     "mode guard #  rate  [name   idx airtime  max_tp]  [avg(tp) avg(prob) sd(prob)]  [prob.|retry|suc|att]  [#success | #attempts]\n");
+		     "mode guard #  rate  [name   idx airtime  max_tp]  [avg(tp) avg(prob) sd(prob)]  [retry|suc|att]  [#success | #attempts]\n");
 
 	p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
 	for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -165,7 +163,7 @@ static char *
 minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 {
 	const struct mcs_group *mg;
-	unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+	unsigned int j, tp_max, tp_avg, eprob, tx_time;
 	char htmode = '2';
 	char gimode = 'L';
 	u32 gflags;
@@ -226,16 +224,14 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
 
-		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
+		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
 				"%u,%llu,%llu,",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
 				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
-- 
cgit v1.2.3


From 0217eefa64509d63fbe8d1205310bafb2355cf4d Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:46:59 +0100
Subject: mac80211: minstrel: reduce MINSTREL_SCALE

The loss of a bit of extra precision does not hurt the calculation, 12
bits is still enough to calculate probabilities well. Reducing the scale
makes it easier to avoid overflows

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index 94b89b1b406c..03716fddaf24 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -14,7 +14,7 @@
 #define SAMPLE_COLUMNS	10	/* number of columns in sample table */
 
 /* scaled fraction values */
-#define MINSTREL_SCALE  16
+#define MINSTREL_SCALE  12
 #define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
 #define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
 
-- 
cgit v1.2.3


From 4f0bc9c61bae9c74ffcf5dbdbd241cbf0ec3a44e Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:47:00 +0100
Subject: mac80211: minstrel: store probability variance instead of standard
 deviation

This avoids the costly int_sqrt calls in the statistics update and moves
it to the debugfs code instead.
This also fixes an overflow in the previous standard deviation
calculation.

Signed-off-by: Thomas Huehn <thomas.huehn@evernet-eg.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.c            |  8 ++++----
 net/mac80211/rc80211_minstrel.h            | 25 ++++++++++++++-----------
 net/mac80211/rc80211_minstrel_debugfs.c    |  8 ++++++--
 net/mac80211/rc80211_minstrel_ht_debugfs.c |  8 ++++++--
 4 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index a284ea08f048..11a4cc393a98 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -168,10 +168,10 @@ minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
 			mrs->prob_ewma = cur_prob;
 		} else {
 			/* update exponential weighted moving variance */
-			mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd,
-							 cur_prob,
-							 mrs->prob_ewma,
-							 EWMA_LEVEL);
+			mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
+							cur_prob,
+							mrs->prob_ewma,
+							EWMA_LEVEL);
 
 			/*update exponential weighted moving avarage */
 			mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index 03716fddaf24..ea86e6784154 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight)
 }
 
 /*
- * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation
+ * Perform EWMV (Exponentially Weighted Moving Variance) calculation
  */
 static inline int
-minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight)
+minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
 {
-	int diff, incr, tmp_var;
+	int diff, incr;
 
-	/* calculate exponential weighted moving variance */
-	diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
+	diff = cur_prob - prob_ewma;
 	incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
-	tmp_var = old_ewmsd * old_ewmsd;
-	tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
-
-	/* return standard deviation */
-	return (u16) int_sqrt(tmp_var);
+	return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
 }
 
 struct minstrel_rate_stats {
@@ -65,7 +60,7 @@ struct minstrel_rate_stats {
 	 *  prob_ewma - exponential weighted moving average of prob
 	 *  prob_ewmsd - exp. weighted moving standard deviation of prob */
 	unsigned int prob_ewma;
-	u16 prob_ewmsd;
+	u16 prob_ewmv;
 
 	/* maximum retry counts */
 	u8 retry_count;
@@ -151,6 +146,14 @@ struct minstrel_debugfs_info {
 	char buf[];
 };
 
+/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
+static inline int
+minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
+{
+	unsigned int ewmv = mrs->prob_ewmv;
+	return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
+}
+
 extern const struct rate_control_ops mac80211_minstrel;
 void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
 void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index ca0bafd6a44e..36fc971deb86 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -93,6 +93,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 	for (i = 0; i < mi->n_rates; i++) {
 		struct minstrel_rate *mr = &mi->r[i];
 		struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+		unsigned int prob_ewmsd;
 
 		*(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
 		*(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -108,6 +109,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
 				"     %3u   %3u %-3u   "
@@ -115,7 +117,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -159,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 	for (i = 0; i < mi->n_rates; i++) {
 		struct minstrel_rate *mr = &mi->r[i];
 		struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+		unsigned int prob_ewmsd;
 
 		p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
 		p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -174,13 +177,14 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
 				"%llu,%llu,%d,%d\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 0cecd23bb834..7d969e300fb3 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -41,6 +41,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
+		unsigned int prob_ewmsd;
 
 		if (!(mi->supported[i] & BIT(j)))
 			continue;
@@ -84,6 +85,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
 				"     %3u   %3u %-3u   "
@@ -91,7 +93,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -185,6 +187,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
+		unsigned int prob_ewmsd;
 
 		if (!(mi->supported[i] & BIT(j)))
 			continue;
@@ -225,13 +228,14 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
 				"%u,%llu,%llu,",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
-- 
cgit v1.2.3


From 4cae4cd10d058eb4057e20cb72e82fe76ae739af Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:47:01 +0100
Subject: mac80211: minstrel: make prob_ewma u16 instead of u32

Saves about 1.2 KiB memory per station

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index ea86e6784154..be6c3f35f48b 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -59,7 +59,7 @@ struct minstrel_rate_stats {
 	/* statistis of packet delivery probability
 	 *  prob_ewma - exponential weighted moving average of prob
 	 *  prob_ewmsd - exp. weighted moving standard deviation of prob */
-	unsigned int prob_ewma;
+	u16 prob_ewma;
 	u16 prob_ewmv;
 
 	/* maximum retry counts */
-- 
cgit v1.2.3


From 73b16589b859130eb8b49882e71c7cd1caa5986d Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Wed, 14 Dec 2016 20:47:02 +0100
Subject: mac80211: minstrel_ht: remove obsolete #if for >= 3 streams

This was added during early development when 3x3 hardware was not very
common yet. This is completely unnecessary now.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index ae45dcb9d7a7..8e783e197e93 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -155,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
 const struct mcs_group minstrel_mcs_groups[] = {
 	MCS_GROUP(1, 0, BW_20),
 	MCS_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 0, BW_20),
-#endif
 
 	MCS_GROUP(1, 1, BW_20),
 	MCS_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 1, BW_20),
-#endif
 
 	MCS_GROUP(1, 0, BW_40),
 	MCS_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 0, BW_40),
-#endif
 
 	MCS_GROUP(1, 1, BW_40),
 	MCS_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 1, BW_40),
-#endif
 
 	CCK_GROUP,
 
 #ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
 	VHT_GROUP(1, 0, BW_20),
 	VHT_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_20),
-#endif
 
 	VHT_GROUP(1, 1, BW_20),
 	VHT_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_20),
-#endif
 
 	VHT_GROUP(1, 0, BW_40),
 	VHT_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_40),
-#endif
 
 	VHT_GROUP(1, 1, BW_40),
 	VHT_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_40),
-#endif
 
 	VHT_GROUP(1, 0, BW_80),
 	VHT_GROUP(2, 0, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_80),
-#endif
 
 	VHT_GROUP(1, 1, BW_80),
 	VHT_GROUP(2, 1, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_80),
 #endif
-#endif
 };
 
 static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
-- 
cgit v1.2.3


From 38252e9ef68936f104a5791740f4ca2f81510a33 Mon Sep 17 00:00:00 2001
From: Thomas Huehn
Date: Wed, 14 Dec 2016 20:47:03 +0100
Subject: mac80211: minstrel: avoid port control frames for sampling

Makes connections more reliable

Signed-off-by: Thomas Huehn <thomas.huehn@evernet-eg.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 11a4cc393a98..3ebe4405a2d4 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -367,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
 		return;
 #endif
 
+	/* Don't use EAPOL frames for sampling on non-mrr hw */
+	if (mp->hw->max_rates == 1 &&
+	    (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+		return;
+
 	delta = (mi->total_packets * sampling_ratio / 100) -
 			(mi->sample_packets + mi->sample_deferred / 2);
 
-- 
cgit v1.2.3


From f6b4122c93f43c3f7b5787ca20fa3ac458f50d05 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 16 Dec 2016 09:44:08 +0100
Subject: rfkill: hide unused goto label

A cleanup introduced a harmless warning in some configurations:

net/rfkill/core.c: In function 'rfkill_init':
net/rfkill/core.c:1350:1: warning: label 'error_input' defined but not used [-Wunused-label]

This adds another #ifdef around the label to match that around the
caller.

Fixes: 6124c53edeea ("rfkill: Cleanup error handling in rfkill_init()")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f6b01f3616e5..b7adaee69756 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1347,8 +1347,10 @@ static int __init rfkill_init(void)
 
 	return 0;
 
+#ifdef CONFIG_RFKILL_INPUT
 error_input:
 	rfkill_any_led_trigger_unregister();
+#endif
 error_led_trigger:
 	misc_deregister(&rfkill_miscdev);
 error_misc:
-- 
cgit v1.2.3


From 505a2e882bfae6627c84586edb276485df05c2ef Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Fri, 16 Dec 2016 11:21:54 +0000
Subject: nl80211: rework {sched_,}scan event related functions

A couple of functions used with scan events were named with
term "send" although they were only preparing the the event
message so renamed those.

Also remove nl80211_send_sched_scan_results() in favor of
just calling nl80211_send_sched_scan() with the right value.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
[mention nl80211_send_sched_scan_results() in the commit log]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 34 ++++++++--------------------------
 net/wireless/nl80211.h |  6 ++----
 net/wireless/scan.c    |  9 +++++----
 3 files changed, 15 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7762231abd32..23692658fe98 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12804,7 +12804,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
 	return -ENOBUFS;
 }
 
-static int nl80211_send_scan_msg(struct sk_buff *msg,
+static int nl80211_prep_scan_msg(struct sk_buff *msg,
 				 struct cfg80211_registered_device *rdev,
 				 struct wireless_dev *wdev,
 				 u32 portid, u32 seq, int flags,
@@ -12835,7 +12835,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
 }
 
 static int
-nl80211_send_sched_scan_msg(struct sk_buff *msg,
+nl80211_prep_sched_scan_msg(struct sk_buff *msg,
 			    struct cfg80211_registered_device *rdev,
 			    struct net_device *netdev,
 			    u32 portid, u32 seq, int flags, u32 cmd)
@@ -12867,7 +12867,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return;
 
-	if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+	if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
 				  NL80211_CMD_TRIGGER_SCAN) < 0) {
 		nlmsg_free(msg);
 		return;
@@ -12886,7 +12886,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return NULL;
 
-	if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+	if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
 				  aborted ? NL80211_CMD_SCAN_ABORTED :
 					    NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
 		nlmsg_free(msg);
@@ -12896,31 +12896,13 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 	return msg;
 }
 
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
-			      struct sk_buff *msg)
-{
-	if (!msg)
-		return;
-
-	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				NL80211_MCGRP_SCAN, GFP_KERNEL);
-}
-
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
-				     struct net_device *netdev)
+/* send message created by nl80211_build_scan_msg() */
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+			   struct sk_buff *msg)
 {
-	struct sk_buff *msg;
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return;
 
-	if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
-					NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
-		nlmsg_free(msg);
-		return;
-	}
-
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
@@ -12934,7 +12916,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return;
 
-	if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
+	if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
 		nlmsg_free(msg);
 		return;
 	}
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 7e3821d7fcc5..75f82520211d 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 			     struct wireless_dev *wdev);
 struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 				       struct wireless_dev *wdev, bool aborted);
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
-			      struct sk_buff *msg);
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+			   struct sk_buff *msg);
 void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 			     struct net_device *netdev, u32 cmd);
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
-				     struct net_device *netdev);
 void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
 				     struct regulatory_request *request);
 
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 35ad69fd0838..21be56b3128e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	ASSERT_RTNL();
 
 	if (rdev->scan_msg) {
-		nl80211_send_scan_result(rdev, rdev->scan_msg);
+		nl80211_send_scan_msg(rdev, rdev->scan_msg);
 		rdev->scan_msg = NULL;
 		return;
 	}
@@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	if (!send_message)
 		rdev->scan_msg = msg;
 	else
-		nl80211_send_scan_result(rdev, msg);
+		nl80211_send_scan_msg(rdev, msg);
 }
 
 void __cfg80211_scan_done(struct work_struct *wk)
@@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
 			spin_unlock_bh(&rdev->bss_lock);
 			request->scan_start = jiffies;
 		}
-		nl80211_send_sched_scan_results(rdev, request->dev);
+		nl80211_send_sched_scan(rdev, request->dev,
+					NL80211_CMD_SCHED_SCAN_RESULTS);
 	}
 
 	rtnl_unlock();
@@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	else
 		rcu_assign_pointer(tmp.pub.beacon_ies, ies);
 	rcu_assign_pointer(tmp.pub.ies, ies);
-	
+
 	memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
 	tmp.pub.scan_width = data->scan_width;
-- 
cgit v1.2.3


From 7b854982b273134e4ff02c61a74e70e1b1876286 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Mon, 19 Dec 2016 09:51:11 +0100
Subject: Revert "rfkill: Add rfkill-any LED trigger"

This reverts commit 73f4f76a196d7adb11a1e192bd8024fe0bc83910.

As Mike reported, and I should've seen in review, we can't call
the new LED functions, which acquire the mutex, from places like
rfkill_set_sw_state() that are documented to be callable from
any context the user likes to use. For Mike's case it led to a
deadlock, but other scenarios are possible.

Reported-by: Михаил Кринкин <krinkin.m.u@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 75 +------------------------------------------------------
 1 file changed, 1 insertion(+), 74 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index b7adaee69756..afa4f71b4c7b 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,47 +176,6 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 	led_trigger_unregister(&rfkill->led_trigger);
 }
-
-static struct led_trigger rfkill_any_led_trigger;
-
-static void __rfkill_any_led_trigger_event(void)
-{
-	enum led_brightness brightness = LED_OFF;
-	struct rfkill *rfkill;
-
-	list_for_each_entry(rfkill, &rfkill_list, node) {
-		if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
-			brightness = LED_FULL;
-			break;
-		}
-	}
-
-	led_trigger_event(&rfkill_any_led_trigger, brightness);
-}
-
-static void rfkill_any_led_trigger_event(void)
-{
-	mutex_lock(&rfkill_global_mutex);
-	__rfkill_any_led_trigger_event();
-	mutex_unlock(&rfkill_global_mutex);
-}
-
-static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
-{
-	rfkill_any_led_trigger_event();
-}
-
-static int rfkill_any_led_trigger_register(void)
-{
-	rfkill_any_led_trigger.name = "rfkill-any";
-	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
-	return led_trigger_register(&rfkill_any_led_trigger);
-}
-
-static void rfkill_any_led_trigger_unregister(void)
-{
-	led_trigger_unregister(&rfkill_any_led_trigger);
-}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -230,23 +189,6 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
-
-static void __rfkill_any_led_trigger_event(void)
-{
-}
-
-static void rfkill_any_led_trigger_event(void)
-{
-}
-
-static int rfkill_any_led_trigger_register(void)
-{
-	return 0;
-}
-
-static void rfkill_any_led_trigger_unregister(void)
-{
-}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -355,7 +297,6 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
-	__rfkill_any_led_trigger_event();
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -536,7 +477,6 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
-	rfkill_any_led_trigger_event();
 
 	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
@@ -580,7 +520,6 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
-	rfkill_any_led_trigger_event();
 
 	return blocked;
 }
@@ -630,7 +569,6 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
-		rfkill_any_led_trigger_event();
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -874,10 +812,8 @@ static int rfkill_resume(struct device *dev)
 	rfkill->suspended = false;
 
 	if (!rfkill->persistent) {
-		mutex_lock(&rfkill_global_mutex);
 		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
 		rfkill_set_block(rfkill, cur);
-		mutex_unlock(&rfkill_global_mutex);
 	}
 
 	if (rfkill->ops->poll && !rfkill->polling_paused)
@@ -1049,7 +985,6 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
-	__rfkill_any_led_trigger_event();
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1082,7 +1017,6 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
-	__rfkill_any_led_trigger_event();
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1335,10 +1269,6 @@ static int __init rfkill_init(void)
 	if (error)
 		goto error_misc;
 
-	error = rfkill_any_led_trigger_register();
-	if (error)
-		goto error_led_trigger;
-
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
 	if (error)
@@ -1349,10 +1279,8 @@ static int __init rfkill_init(void)
 
 #ifdef CONFIG_RFKILL_INPUT
 error_input:
-	rfkill_any_led_trigger_unregister();
-#endif
-error_led_trigger:
 	misc_deregister(&rfkill_miscdev);
+#endif
 error_misc:
 	class_unregister(&rfkill_class);
 error_class:
@@ -1365,7 +1293,6 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
-	rfkill_any_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
-- 
cgit v1.2.3


From eab59075d3cd7f3535aa2dbbc19a198dfee58892 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:26:31 -0200
Subject: sctp: reduce indent level at sctp_sf_tabort_8_4_8

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8ec20a64a3f8..32587b1f84e7 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3237,36 +3237,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
 	struct sctp_chunk *abort;
 
 	packet = sctp_ootb_pkt_new(net, asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
 
-	if (packet) {
-		/* Make an ABORT. The T bit will be set if the asoc
-		 * is NULL.
-		 */
-		abort = sctp_make_abort(asoc, chunk, 0);
-		if (!abort) {
-			sctp_ootb_pkt_free(packet);
-			return SCTP_DISPOSITION_NOMEM;
-		}
+	/* Make an ABORT. The T bit will be set if the asoc
+	 * is NULL.
+	 */
+	abort = sctp_make_abort(asoc, chunk, 0);
+	if (!abort) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
 
-		/* Reflect vtag if T-Bit is set */
-		if (sctp_test_T_bit(abort))
-			packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+	/* Reflect vtag if T-Bit is set */
+	if (sctp_test_T_bit(abort))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
 
-		/* Set the skb to the belonging sock for accounting.  */
-		abort->skb->sk = ep->base.sk;
+	/* Set the skb to the belonging sock for accounting.  */
+	abort->skb->sk = ep->base.sk;
 
-		sctp_packet_append_chunk(packet, abort);
+	sctp_packet_append_chunk(packet, abort);
 
-		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
-				SCTP_PACKET(packet));
-
-		SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+			SCTP_PACKET(packet));
 
-		sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-		return SCTP_DISPOSITION_CONSUME;
-	}
+	SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-	return SCTP_DISPOSITION_NOMEM;
+	sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	return SCTP_DISPOSITION_CONSUME;
 }
 
 /*
-- 
cgit v1.2.3


From 1ff0156167dfb625c2e82aefded6b4cf245476ce Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:26:32 -0200
Subject: sctp: reduce indent level in sctp_sf_shut_8_4_5

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 58 ++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 32587b1f84e7..a95915ef9dba 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3501,45 +3501,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
 	struct sctp_chunk *shut;
 
 	packet = sctp_ootb_pkt_new(net, asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
 
-	if (packet) {
-		/* Make an SHUTDOWN_COMPLETE.
-		 * The T bit will be set if the asoc is NULL.
-		 */
-		shut = sctp_make_shutdown_complete(asoc, chunk);
-		if (!shut) {
-			sctp_ootb_pkt_free(packet);
-			return SCTP_DISPOSITION_NOMEM;
-		}
-
-		/* Reflect vtag if T-Bit is set */
-		if (sctp_test_T_bit(shut))
-			packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+	/* Make an SHUTDOWN_COMPLETE.
+	 * The T bit will be set if the asoc is NULL.
+	 */
+	shut = sctp_make_shutdown_complete(asoc, chunk);
+	if (!shut) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
 
-		/* Set the skb to the belonging sock for accounting.  */
-		shut->skb->sk = ep->base.sk;
+	/* Reflect vtag if T-Bit is set */
+	if (sctp_test_T_bit(shut))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
 
-		sctp_packet_append_chunk(packet, shut);
+	/* Set the skb to the belonging sock for accounting.  */
+	shut->skb->sk = ep->base.sk;
 
-		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
-				SCTP_PACKET(packet));
+	sctp_packet_append_chunk(packet, shut);
 
-		SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+			SCTP_PACKET(packet));
 
-		/* If the chunk length is invalid, we don't want to process
-		 * the reset of the packet.
-		 */
-		if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
-			return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-		/* We need to discard the rest of the packet to prevent
-		 * potential bomming attacks from additional bundled chunks.
-		 * This is documented in SCTP Threats ID.
-		 */
+	/* If the chunk length is invalid, we don't want to process
+	 * the reset of the packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-	}
 
-	return SCTP_DISPOSITION_NOMEM;
+	/* We need to discard the rest of the packet to prevent
+	 * potential bomming attacks from additional bundled chunks.
+	 * This is documented in SCTP Threats ID.
+	 */
+	return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 }
 
 /*
-- 
cgit v1.2.3


From 0630c56e40b0bcca299d3b4c20ffcfddbe6a0218 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:26:33 -0200
Subject: sctp: simplify addr copy

Make it a bit easier to read.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c     | 16 +++++++---------
 net/sctp/protocol.c | 18 +++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 5ed8e79bf102..6619367bb6ca 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -412,22 +412,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
 static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 			     int is_saddr)
 {
-	__be16 *port;
-	struct sctphdr *sh;
+	/* Always called on head skb, so this is safe */
+	struct sctphdr *sh = sctp_hdr(skb);
+	struct sockaddr_in6 *sa = &addr->v6;
 
-	port = &addr->v6.sin6_port;
 	addr->v6.sin6_family = AF_INET6;
 	addr->v6.sin6_flowinfo = 0; /* FIXME */
 	addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
 
-	/* Always called on head skb, so this is safe */
-	sh = sctp_hdr(skb);
 	if (is_saddr) {
-		*port  = sh->source;
-		addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
+		sa->sin6_port = sh->source;
+		sa->sin6_addr = ipv6_hdr(skb)->saddr;
 	} else {
-		*port = sh->dest;
-		addr->v6.sin6_addr = ipv6_hdr(skb)->daddr;
+		sa->sin6_port = sh->dest;
+		sa->sin6_addr = ipv6_hdr(skb)->daddr;
 	}
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 616a9428e0c4..f9c3c37c9ae0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -237,23 +237,19 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
 static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 			     int is_saddr)
 {
-	void *from;
-	__be16 *port;
-	struct sctphdr *sh;
+	/* Always called on head skb, so this is safe */
+	struct sctphdr *sh = sctp_hdr(skb);
+	struct sockaddr_in *sa = &addr->v4;
 
-	port = &addr->v4.sin_port;
 	addr->v4.sin_family = AF_INET;
 
-	/* Always called on head skb, so this is safe */
-	sh = sctp_hdr(skb);
 	if (is_saddr) {
-		*port  = sh->source;
-		from = &ip_hdr(skb)->saddr;
+		sa->sin_port = sh->source;
+		sa->sin_addr.s_addr = ip_hdr(skb)->saddr;
 	} else {
-		*port = sh->dest;
-		from = &ip_hdr(skb)->daddr;
+		sa->sin_port = sh->dest;
+		sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
 	}
-	memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
 }
 
 /* Initialize an sctp_addr from a socket. */
-- 
cgit v1.2.3


From 66b91d2cd0344c417194596ef6e387e52be69e57 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:26:34 -0200
Subject: sctp: remove return value from sctp_packet_init/config

There is no reason to use this cascading. It doesn't add anything.
Let's remove it and simplify.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  7 +++----
 net/sctp/output.c          | 14 +++++---------
 net/sctp/sm_statefuns.c    |  5 +++--
 3 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 92daabdc007d..87d56cc80a3c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -722,10 +722,9 @@ struct sctp_packet {
 	    ipfragok:1;		/* So let ip fragment this packet */
 };
 
-struct sctp_packet *sctp_packet_init(struct sctp_packet *,
-				     struct sctp_transport *,
-				     __u16 sport, __u16 dport);
-struct sctp_packet *sctp_packet_config(struct sctp_packet *, __u32 vtag, int);
+void sctp_packet_init(struct sctp_packet *, struct sctp_transport *,
+		      __u16 sport, __u16 dport);
+void sctp_packet_config(struct sctp_packet *, __u32 vtag, int);
 sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *,
 				       struct sctp_chunk *, int, gfp_t);
 sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index f5320a87341e..07ab5062e541 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -81,8 +81,8 @@ static void sctp_packet_reset(struct sctp_packet *packet)
 /* Config a packet.
  * This appears to be a followup set of initializations.
  */
-struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
-				       __u32 vtag, int ecn_capable)
+void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
+			int ecn_capable)
 {
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
@@ -123,14 +123,12 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
 		if (chunk)
 			sctp_packet_append_chunk(packet, chunk);
 	}
-
-	return packet;
 }
 
 /* Initialize the packet structure. */
-struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
-				     struct sctp_transport *transport,
-				     __u16 sport, __u16 dport)
+void sctp_packet_init(struct sctp_packet *packet,
+		      struct sctp_transport *transport,
+		      __u16 sport, __u16 dport)
 {
 	struct sctp_association *asoc = transport->asoc;
 	size_t overhead;
@@ -151,8 +149,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
 	packet->overhead = overhead;
 	sctp_packet_reset(packet);
 	packet->vtag = 0;
-
-	return packet;
 }
 
 /* Free a packet.  */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index a95915ef9dba..9a223d5b2314 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6032,8 +6032,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
 	sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
 			     sctp_sk(net->sctp.ctl_sock));
 
-	packet = sctp_packet_init(&transport->packet, transport, sport, dport);
-	packet = sctp_packet_config(packet, vtag, 0);
+	packet = &transport->packet;
+	sctp_packet_init(packet, transport, sport, dport);
+	sctp_packet_config(packet, vtag, 0);
 
 	return packet;
 
-- 
cgit v1.2.3


From 509e7a311f0423bce6d3e1705ba0a470ffe770dc Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:26:35 -0200
Subject: sctp: sctp_chunk_length_valid should return bool

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 9a223d5b2314..3382ef254e7b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
  * is set to be the size of a specific chunk we are testing.
- * Return Values:  1 = Valid length
- * 		   0 = Invalid length
+ * Return Values:  true  = Valid length
+ * 		   false = Invalid length
  *
  */
-static inline int
-sctp_chunk_length_valid(struct sctp_chunk *chunk,
-			   __u16 required_length)
+static inline bool
+sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length)
 {
 	__u16 chunk_length = ntohs(chunk->chunk_hdr->length);
 
 	/* Previously already marked? */
 	if (unlikely(chunk->pdiscard))
-		return 0;
+		return false;
 	if (unlikely(chunk_length < required_length))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /**********************************************************
-- 
cgit v1.2.3


From b77b7565a6d11611c3f179941d2b95c2d72939dc Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 28 Dec 2016 09:51:56 -0200
Subject: sctp: add pr_debug for tracking asocs not found

This pr_debug may help identify why the system is generating some
Aborts. It's not something a sysadmin would be expected to use.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 458e506ef84b..704ad19c1565 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1229,13 +1229,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
 	struct sctp_association *asoc;
 
 	asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+	if (asoc)
+		goto out;
 
 	/* Further lookup for INIT/INIT-ACK packets.
 	 * SCTP Implementors Guide, 2.18 Handling of address
 	 * parameters within the INIT or INIT-ACK.
 	 */
-	if (!asoc)
-		asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+	asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+	if (asoc)
+		goto out;
 
+	if (paddr->sa.sa_family == AF_INET)
+		pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
+			 &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
+			 &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
+	else
+		pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
+			 &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
+			 &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
+
+out:
 	return asoc;
 }
-- 
cgit v1.2.3


From 1946e672c173559155a3e210fe95dbf8b7b8ddf7 Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Wed, 28 Dec 2016 17:52:32 +0800
Subject: ipv4: Namespaceify tcp_tw_recycle and tcp_max_tw_buckets knob

Different namespace application might require fast recycling
TIME-WAIT sockets independently of the host.

Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 13 +------------
 include/net/netns/ipv4.h         | 11 +++++++++++
 include/net/tcp.h                |  1 -
 net/ipv4/af_inet.c               |  2 --
 net/ipv4/inet_timewait_sock.c    |  3 +--
 net/ipv4/proc.c                  |  2 +-
 net/ipv4/sysctl_net_ipv4.c       | 28 ++++++++++++++--------------
 net/ipv4/tcp.c                   |  3 ++-
 net/ipv4/tcp_input.c             |  2 +-
 net/ipv4/tcp_ipv4.c              | 12 ++++++++----
 net/ipv4/tcp_minisocks.c         | 14 +++++---------
 net/ipv6/tcp_ipv6.c              |  7 ++++---
 12 files changed, 48 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c9b3eb70f340..6a75d67a30fd 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -29,16 +29,6 @@
 
 #include <linux/atomic.h>
 
-struct inet_hashinfo;
-
-struct inet_timewait_death_row {
-	atomic_t		tw_count;
-
-	struct inet_hashinfo 	*hashinfo ____cacheline_aligned_in_smp;
-	int			sysctl_tw_recycle;
-	int			sysctl_max_tw_buckets;
-};
-
 struct inet_bind_bucket;
 
 /*
@@ -125,8 +115,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
 
 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
-		     struct inet_timewait_death_row *twdr, int family);
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
 
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 0378e88f6fd3..fffd38453985 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -27,6 +27,16 @@ struct ping_group_range {
 	kgid_t		range[2];
 };
 
+struct inet_hashinfo;
+
+struct inet_timewait_death_row {
+	atomic_t		tw_count;
+
+	struct inet_hashinfo 	*hashinfo ____cacheline_aligned_in_smp;
+	int			sysctl_tw_recycle;
+	int			sysctl_max_tw_buckets;
+};
+
 struct netns_ipv4 {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*forw_hdr;
@@ -111,6 +121,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_fin_timeout;
 	unsigned int sysctl_tcp_notsent_lowat;
 	int sysctl_tcp_tw_reuse;
+	struct inet_timewait_death_row tcp_death_row;
 
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6061963cca98..1da0aa724929 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -231,7 +231,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define	TFO_SERVER_WO_SOCKOPT1	0x400
 
-extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
 extern int sysctl_tcp_timestamps;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f75069883f2b..aae410bb655a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1831,8 +1831,6 @@ static int __init inet_init(void)
 
 	ip_init();
 
-	tcp_v4_init();
-
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
-		     struct inet_timewait_death_row *twdr, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..0247ca032232 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -65,7 +65,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
-		   atomic_read(&tcp_death_row.tw_count), sockets,
+		   atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
 		   proto_memory_allocated(&tcp_prot));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 22cbd61079b5..66f8f1b1dc78 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -289,13 +289,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_max_tw_buckets",
-		.data		= &tcp_death_row.sysctl_max_tw_buckets,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -309,13 +302,6 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
 		.proc_handler	= proc_tcp_fastopen_key,
 	},
-	{
-		.procname	= "tcp_tw_recycle",
-		.data		= &tcp_death_row.sysctl_tw_recycle,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_abort_on_overflow",
 		.data		= &sysctl_tcp_abort_on_overflow,
@@ -960,6 +946,20 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_max_tw_buckets",
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_tw_recycle",
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4a044964da66..7f0d81c090ce 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3334,6 +3334,7 @@ void __init tcp_init(void)
 
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+	inet_hashinfo_init(&tcp_hashinfo);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
@@ -3378,7 +3379,6 @@ void __init tcp_init(void)
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 
-	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
@@ -3399,6 +3399,7 @@ void __init tcp_init(void)
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
+	tcp_v4_init();
 	tcp_metrics_init();
 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6c790754ae3e..c61480249835 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6363,7 +6363,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tcp_death_row.sysctl_tw_recycle) {
+		if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
 			bool strict;
 
 			dst = af_ops->route_req(sk, &fl, req, &strict);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe9da4fb96bf..56b5f49e3f97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -146,6 +146,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct rtable *rt;
 	int err;
 	struct ip_options_rcu *inet_opt;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
@@ -196,7 +197,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 			tp->write_seq	   = 0;
 	}
 
-	if (tcp_death_row.sysctl_tw_recycle &&
+	if (tcp_death_row->sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 		tcp_fetch_timewait_stamp(sk, &rt->dst);
 
@@ -215,7 +216,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = inet_hash_connect(&tcp_death_row, sk);
+	err = inet_hash_connect(tcp_death_row, sk);
 	if (err)
 		goto failure;
 
@@ -2457,6 +2458,10 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 0;
 
+	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (tcp_hashinfo.ehash_mask + 1) / 2;
+	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
@@ -2466,7 +2471,7 @@ fail:
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
-	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+	inet_twsk_purge(&tcp_hashinfo, AF_INET);
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2477,7 +2482,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 
 void __init tcp_v4_init(void)
 {
-	inet_hashinfo_init(&tcp_hashinfo);
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 28ce5ee831f5..06fde26a82b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
 
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
-struct inet_timewait_death_row tcp_death_row = {
-	.sysctl_max_tw_buckets = NR_FILE * 2,
-	.hashinfo	= &tcp_hashinfo,
-};
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
@@ -100,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 	bool paws_reject = false;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -153,7 +148,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
-		if (tcp_death_row.sysctl_tw_recycle &&
+		if (tcp_death_row->sysctl_tw_recycle &&
 		    tcptw->tw_ts_recent_stamp &&
 		    tcp_tw_remember_stamp(tw))
 			inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +259,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_timewait_sock *tw;
 	bool recycle_ok = false;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
-	tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+	tw = inet_twsk_alloc(sk, tcp_death_row, state);
 
 	if (tw) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 73bc8fc68acd..a4cdf6a34c30 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -123,6 +123,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct dst_entry *dst;
 	int addr_type;
 	int err;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
@@ -258,7 +259,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_gso_type = SKB_GSO_TCPV6;
 	ip6_dst_store(sk, dst, NULL, NULL);
 
-	if (tcp_death_row.sysctl_tw_recycle &&
+	if (tcp_death_row->sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
 	    ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
 		tcp_fetch_timewait_stamp(sk, dst);
@@ -273,7 +274,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->inet_dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = inet6_hash_connect(&tcp_death_row, sk);
+	err = inet6_hash_connect(tcp_death_row, sk);
 	if (err)
 		goto late_failure;
 
@@ -1948,7 +1949,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
 
 static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
 {
-	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
+	inet_twsk_purge(&tcp_hashinfo, AF_INET6);
 }
 
 static struct pernet_operations tcpv6_net_ops = {
-- 
cgit v1.2.3


From fee83d097b1620530f23bf6063f4ea251ba9c8c7 Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Wed, 28 Dec 2016 17:52:33 +0800
Subject: ipv4: Namespaceify tcp_max_syn_backlog knob

Different namespace application might require different maximal
number of remembered connection requests.

Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/request_sock.h |  4 +---
 net/core/request_sock.c    |  2 --
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp.c             |  2 --
 net/ipv4/tcp_input.c       |  4 ++--
 net/ipv4/tcp_ipv4.c        |  7 +++++--
 7 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index fffd38453985..8e3f5b6f26d5 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -122,6 +122,7 @@ struct netns_ipv4 {
 	unsigned int sysctl_tcp_notsent_lowat;
 	int sysctl_tcp_tw_reuse;
 	struct inet_timewait_death_row tcp_death_row;
+	int sysctl_max_syn_backlog;
 
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 6ebe13eb1c4c..a12a5d25b27e 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -1,7 +1,7 @@
 /*
  * NET		Generic infrastructure for Network protocols.
  *
- *		Definitions for request_sock 
+ *		Definitions for request_sock
  *
  * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
  *
@@ -123,8 +123,6 @@ static inline void reqsk_put(struct request_sock *req)
 		reqsk_free(req);
 }
 
-extern int sysctl_max_syn_backlog;
-
 /*
  * For a TCP Fast Open listener -
  *	lock - protects the access to all the reqsk, which is co-owned by
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
  * and it will increase in proportion to the memory of machine.
  * Note : Dont forget somaxconn that may limit backlog too.
  */
-int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
 
 void reqsk_queue_alloc(struct request_sock_queue *queue)
 {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 66f8f1b1dc78..134d8e191366 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -323,13 +323,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_max_syn_backlog",
-		.data		= &sysctl_max_syn_backlog,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "inet_peer_threshold",
 		.data		= &inet_peer_threshold,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_max_syn_backlog",
+		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7f0d81c090ce..2e3807d8eba8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3378,9 +3378,7 @@ void __init tcp_init(void)
 
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
-
 	sysctl_tcp_max_orphans = cnt / 2;
-	sysctl_max_syn_backlog = max(128, cnt / 256);
 
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c61480249835..ec6d84363024 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6377,8 +6377,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		}
 		/* Kill the following clause, if you dislike this way. */
 		else if (!net->ipv4.sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-			  (sysctl_max_syn_backlog >> 2)) &&
+			 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
 			 !tcp_peer_is_proven(req, dst, false,
 					     tmp_opt.saw_tstamp)) {
 			/* Without syncookies last quarter of
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 56b5f49e3f97..7e4be4f361f3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2419,7 +2419,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-	int res, cpu;
+	int res, cpu, cnt;
 
 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
 	if (!net->ipv4.tcp_sk)
@@ -2458,10 +2458,13 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 0;
 
+	cnt = tcp_hashinfo.ehash_mask + 1;
 	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
-	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (tcp_hashinfo.ehash_mask + 1) / 2;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
 
+	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
-- 
cgit v1.2.3


From de8499cee59465cfa6135591d665a065539d456b Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Wed, 28 Dec 2016 11:53:18 -0500
Subject: ipv6: remove unnecessary inet6_sk check

np is already assigned in the variable declaration of ping_v6_sendmsg.
At this point, we have already dereferenced np several times, so the
NULL check is also redundant.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ping.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e1f8b34d7a2e..9b522fa90e6d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -126,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		return PTR_ERR(dst);
 	rt = (struct rt6_info *) dst;
 
-	np = inet6_sk(sk);
-	if (!np) {
-		err = -EBADF;
-		goto dst_err_out;
-	}
-
 	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
 		fl6.flowi6_oif = np->mcast_oif;
 	else if (!fl6.flowi6_oif)
@@ -166,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 	release_sock(sk);
 
-dst_err_out:
 	dst_release(dst);
 
 	if (err)
-- 
cgit v1.2.3


From bfd2e4b8734d34632f00391994b89c558dcb7d4e Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Thu, 29 Dec 2016 15:53:28 -0200
Subject: sctp: refactor sctp_datamsg_from_user

This patch refactors sctp_datamsg_from_user() in an attempt to make it
better to read and avoid code duplication for handling the last
fragment.

It also avoids doing division and remaining operations. Even though, it
should still operate similarly as before this patch.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/chunk.c | 107 +++++++++++++++++--------------------------------------
 1 file changed, 32 insertions(+), 75 deletions(-)

(limited to 'net')

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41df..e3621cb4827f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					    struct sctp_sndrcvinfo *sinfo,
 					    struct iov_iter *from)
 {
-	int max, whole, i, offset, over, err;
-	int len, first_len;
-	int max_data;
+	size_t len, first_len, max_data, remaining;
+	size_t msg_len = iov_iter_count(from);
+	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_datamsg *msg;
-	struct list_head *pos, *temp;
-	size_t msg_len = iov_iter_count(from);
-	__u8 frag;
+	int err;
 
 	msg = sctp_datamsg_new(GFP_KERNEL);
 	if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	    (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
 	     !SCTP_PR_POLICY(sinfo->sinfo_flags)))
 		msg->expires_at = jiffies +
-				    msecs_to_jiffies(sinfo->sinfo_timetolive);
+				  msecs_to_jiffies(sinfo->sinfo_timetolive);
 
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
 	max_data = SCTP_TRUNC4(max_data);
 
-	max = asoc->frag_point;
 	/* If the the peer requested that we authenticate DATA chunks
 	 * we need to account for bundling of the AUTH chunks along with
 	 * DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					      hmac_desc->hmac_len);
 	}
 
-	/* Now, check if we need to reduce our max */
-	if (max > max_data)
-		max = max_data;
+	/* Check what's our max considering the above */
+	max_data = min_t(size_t, max_data, asoc->frag_point);
 
-	whole = 0;
-	first_len = max;
+	/* Set first_len and then account for possible bundles on first frag */
+	first_len = max_data;
 
 	/* Check to see if we have a pending SACK and try to let it be bundled
 	 * with this message.  Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
 	    asoc->outqueue.out_qlen == 0 &&
 	    list_empty(&asoc->outqueue.retransmit) &&
-	    msg_len > max)
-		max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+	    msg_len > max_data)
+		first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
 
 	/* Encourage Cookie-ECHO bundling. */
 	if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
-		max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
-	/* Now that we adjusted completely, reset first_len */
-	if (first_len > max_data)
-		first_len = max_data;
+		first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
 
 	/* Account for a different sized first fragment */
 	if (msg_len >= first_len) {
-		msg_len -= first_len;
-		whole = 1;
 		msg->can_delay = 0;
-	}
-
-	/* How many full sized?  How many bytes leftover? */
-	whole += msg_len / max;
-	over = msg_len % max;
-	offset = 0;
-
-	if ((whole > 1) || (whole && over))
 		SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+	} else {
+		/* Which may be the only one... */
+		first_len = msg_len;
+	}
 
-	/* Create chunks for all the full sized DATA chunks. */
-	for (i = 0, len = first_len; i < whole; i++) {
-		frag = SCTP_DATA_MIDDLE_FRAG;
+	/* Create chunks for all DATA chunks. */
+	for (remaining = msg_len; remaining; remaining -= len) {
+		u8 frag = SCTP_DATA_MIDDLE_FRAG;
 
-		if (0 == i)
+		if (remaining == msg_len) {
+			/* First frag, which may also be the last */
 			frag |= SCTP_DATA_FIRST_FRAG;
+			len = first_len;
+		} else {
+			/* Middle frags */
+			len = max_data;
+		}
 
-		if ((i == (whole - 1)) && !over) {
+		if (len >= remaining) {
+			/* Last frag, which may also be the first */
+			len = remaining;
 			frag |= SCTP_DATA_LAST_FRAG;
 
 			/* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 
 		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
 						 0, GFP_KERNEL);
-
 		if (!chunk) {
 			err = -ENOMEM;
 			goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 			goto errout_chunk_free;
 
 		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-
-		sctp_datamsg_assign(msg, chunk);
-		list_add_tail(&chunk->frag_list, &msg->chunks);
-
-		/* The first chunk, the first chunk was likely short
-		 * to allow bundling, so reset to full size.
-		 */
-		if (0 == i)
-			len = max;
-	}
-
-	/* .. now the leftover bytes. */
-	if (over) {
-		if (!whole)
-			frag = SCTP_DATA_NOT_FRAG;
-		else
-			frag = SCTP_DATA_LAST_FRAG;
-
-		if ((sinfo->sinfo_flags & SCTP_EOF) ||
-		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
-			frag |= SCTP_DATA_SACK_IMM;
-
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
-						 0, GFP_KERNEL);
-
-		if (!chunk) {
-			err = -ENOMEM;
-			goto errout;
-		}
-
-		err = sctp_user_addto_chunk(chunk, over, from);
-
-		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-		if (err < 0)
-			goto errout_chunk_free;
+		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+				       chunk->skb->data);
 
 		sctp_datamsg_assign(msg, chunk);
 		list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ errout:
 		sctp_chunk_free(chunk);
 	}
 	sctp_datamsg_put(msg);
+
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.3


From 3d48b53fb2ae37158e700ffef3f45461ff15c965 Mon Sep 17 00:00:00 2001
From: Matthias Tafelmeier
Date: Thu, 29 Dec 2016 21:37:21 +0100
Subject: net: dev_weight: TX/RX orthogonality

Oftenly, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case are nodes acting as noSQL data bases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  4 ++++
 net/core/dev.c               |  8 ++++++--
 net/core/sysctl_net_core.c   | 31 ++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 62 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7ea740..b80fbd4e5575 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f7423a74b..ecd78b3c9aba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
+extern int		dev_rx_weight;
+extern int		dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b4b520..56818f7eab2b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3427,7 +3427,11 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;            /* old backlog weight */
+int weight_p __read_mostly = 64;           /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e4009f62..eaa72eb0399c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e88519..b052b27a984e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
cgit v1.2.3


From 3a543ef479868e36c95935de320608a7e41466ca Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 29 Dec 2016 14:20:56 -0800
Subject: net: dsa: Implement ndo_get_phys_port_id

Implement ndo_get_phys_port_id() by returning the physical port number
of the switch this per-port DSA created network interface corresponds
to.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 68c9eea00518..ffd91969b830 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -984,6 +984,17 @@ static void dsa_slave_poll_controller(struct net_device *dev)
 }
 #endif
 
+static int dsa_slave_get_phys_port_id(struct net_device *dev,
+				      struct netdev_phys_item_id *ppid)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	ppid->id_len = sizeof(p->port);
+	memcpy(ppid->id, &p->port, ppid->id_len);
+
+	return 0;
+}
+
 void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 {
 	ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -1031,6 +1042,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
+	.ndo_get_phys_port_id	= dsa_slave_get_phys_port_id,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
-- 
cgit v1.2.3


From 7bb387c5ab12aeac3d5eea28686489ff46b53ca9 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 29 Dec 2016 15:39:37 -0800
Subject: net: Allow IP_MULTICAST_IF to set index to L3 slave

IP_MULTICAST_IF fails if sk_bound_dev_if is already set and the new index
does not match it. e.g.,

    ntpd[15381]: setsockopt IP_MULTICAST_IF 192.168.1.23 fails: Invalid argument

Relax the check in setsockopt to allow setting mc_index to an L3 slave if
sk_bound_dev_if points to an L3 master.

Make a similar change for IPv6. In this case change the device lookup to
take the rcu_read_lock avoiding a refcnt. The rcu lock is also needed for
the lookup of a potential L3 master device.

This really only silences a setsockopt failure since uses of mc_index are
secondary to sk_bound_dev_if if it is set. In both cases, if either index
is an L3 slave or master, lookups are directed to the same FIB table so
relaxing the check at setsockopt time causes no harm.

Patch is based on a suggested change by Darwin for a problem noted in
their code base.

Suggested-by: Darwin Dingel <darwin.dingel@alliedtelesis.co.nz>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c   |  7 ++++++-
 net/ipv6/ipv6_sockglue.c | 16 ++++++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57e1405e8282..cf9b43de4690 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	{
 		struct ip_mreqn mreq;
 		struct net_device *dev = NULL;
+		int midx;
 
 		if (sk->sk_type == SOCK_STREAM)
 			goto e_inval;
@@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = -EADDRNOTAVAIL;
 		if (!dev)
 			break;
+
+		midx = l3mdev_master_ifindex(dev);
+
 		dev_put(dev);
 
 		err = -EINVAL;
 		if (sk->sk_bound_dev_if &&
-		    mreq.imr_ifindex != sk->sk_bound_dev_if)
+		    mreq.imr_ifindex != sk->sk_bound_dev_if &&
+		    (!midx || midx != sk->sk_bound_dev_if))
 			break;
 
 		inet->mc_index = mreq.imr_ifindex;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ee97c44e2aa0..a531ba032b85 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -595,16 +595,24 @@ done:
 
 		if (val) {
 			struct net_device *dev;
+			int midx;
 
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
-				goto e_inval;
+			rcu_read_lock();
 
-			dev = dev_get_by_index(net, val);
+			dev = dev_get_by_index_rcu(net, val);
 			if (!dev) {
+				rcu_read_unlock();
 				retv = -ENODEV;
 				break;
 			}
-			dev_put(dev);
+			midx = l3mdev_master_ifindex_rcu(dev);
+
+			rcu_read_unlock();
+
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != val &&
+			    (!midx || midx != sk->sk_bound_dev_if))
+				goto e_inval;
 		}
 		np->mcast_oif = val;
 		retv = 0;
-- 
cgit v1.2.3


From 32bcad4b7f33f79847e6d4757e9cf26c6980bac6 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich
Date: Thu, 15 Dec 2016 18:13:23 +0100
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index a6cc8040a21d..8683542067ba 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.5"
+#define BATADV_SOURCE_VERSION "2017.0"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3


From 57629915d568c522ac1422df7bba4bee5b5c7a7c Mon Sep 17 00:00:00 2001
From: Ilan peer
Date: Mon, 26 Dec 2016 18:17:36 +0200
Subject: mac80211: Fix addition of mesh configuration element

The code was setting the capabilities byte to zero,
after it was already properly set previously. Fix it.

The bug was found while debugging hwsim mesh tests failures
that happened since the commit mentioned below.

Fixes: 76f43b4c0a93 ("mac80211: Remove invalid flag operations in mesh TSF synchronization")
Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Reviewed-by: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index cc2a63bd233f..9c23172feba0 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,8 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
 	/* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
 	*pos |= ifmsh->ps_peers_deep_sleep ?
 			IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
-	*pos++ = 0x00;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From f69b22e65ecfcb3648304e0e32a1df6f0d421375 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Wed, 4 Nov 2015 13:42:39 -0800
Subject: RDS: log the address on bind failure

It's useful to know the IP address when RDS fails to bind a
connection. Thus, adding it to the error message.

Orabug: 21894138
Reviewed-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/bind.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce583fe..3a915bedb76c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
-		printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
-				"load rds_tcp or rds_rdma?\n");
+		pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
+				    __func__, &sin->sin_addr.s_addr);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From bb7897631d2379ec198635cc24bf1e8c629d0bda Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Sun, 4 Dec 2016 16:41:29 -0800
Subject: RDS: mark few internal functions static to make sparse build happy

Fixes below warnings:
warning: symbol 'rds_send_probe' was not declared. Should it be static?
warning: symbol 'rds_send_ping' was not declared. Should it be static?
warning: symbol 'rds_tcp_accept_one_path' was not declared. Should it be static?
warning: symbol 'rds_walk_conn_path_info' was not declared. Should it be static?

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/connection.c | 10 +++++-----
 net/rds/send.c       |  4 ++--
 net/rds/tcp_listen.c |  1 +
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index fe9d31c0b22d..0e04dcceb1d4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -545,11 +545,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
-void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
-			     struct rds_info_iterator *iter,
-			     struct rds_info_lengths *lens,
-			     int (*visitor)(struct rds_conn_path *, void *),
-			     size_t item_len)
+static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+				    struct rds_info_iterator *iter,
+				    struct rds_info_lengths *lens,
+				    int (*visitor)(struct rds_conn_path *, void *),
+				    size_t item_len)
 {
 	u64  buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
diff --git a/net/rds/send.c b/net/rds/send.c
index 77c8c6e613ad..bb13c56fc2f8 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1169,7 +1169,7 @@ out:
  * or
  *   RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
  */
-int
+static int
 rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 	       __be16 dport, u8 h_flags)
 {
@@ -1238,7 +1238,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport)
 	return rds_send_probe(cp, 0, dport, 0);
 }
 
-void
+static void
 rds_send_ping(struct rds_connection *conn)
 {
 	unsigned long flags;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index f74bab3ecdca..67d0929c7d3d 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,6 +79,7 @@ bail:
  * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
  * by moving them to CONNECTING in this function.
  */
+static
 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
 	int i;
-- 
cgit v1.2.3


From ff3f19a2f608ee406331e8c7b60d7376e75c2157 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 14 Mar 2016 07:43:55 -0700
Subject: RDS: IB: include faddr in connection log

Also use pr_* for it.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_cm.c   | 19 +++++++++----------
 net/rds/ib_recv.c |  4 ++--
 net/rds/ib_send.c |  4 ++--
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95afa07..b9da1e59ecc1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,19 +113,18 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	}
 
 	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
-		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
-		       " no longer supported\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version));
+		pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version));
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version),
-		       ic->i_flowctl ? ", flow control" : "");
+		pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version),
+			  ic->i_flowctl ? ", flow control" : "");
 	}
 
 	/*
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f681d2..6803b75eb8bd 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -980,8 +980,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 	} else {
 		/* We expect errors as the qp is drained during shutdown */
 		if (rds_conn_up(conn) || rds_conn_connecting(conn))
-			rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-					  &conn->c_faddr,
+			rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+					  &conn->c_laddr, &conn->c_faddr,
 					  wc->status,
 					  ib_wc_status_msg(wc->status));
 	}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c97332f..19eca5c4c00c 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -300,8 +300,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
 	/* We expect errors as the qp is drained during shutdown */
 	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-		rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-				  &conn->c_faddr, wc->status,
+		rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+				  &conn->c_laddr, &conn->c_faddr, wc->status,
 				  ib_wc_status_msg(wc->status));
 	}
 }
-- 
cgit v1.2.3


From fab8688d7185a1fe01ee9e0930fc59c0f161ee93 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 4 Jul 2016 15:31:21 -0700
Subject: RDS: IB: make the transport retry count smallest

Transport retry is not much useful since it indicate packet loss
in fabric so its better to failover fast rather than longer retry.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8e58f4..f4e81214e70a 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -16,7 +16,7 @@
 #define RDS_IB_DEFAULT_SEND_WR		256
 #define RDS_IB_DEFAULT_FR_WR		512
 
-#define RDS_IB_DEFAULT_RETRY_COUNT	2
+#define RDS_IB_DEFAULT_RETRY_COUNT	1
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
-- 
cgit v1.2.3


From 3e56c2f856d7aba6a03feea834d68f9c05f7d0b6 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Sun, 4 Dec 2016 16:25:43 -0800
Subject: RDS: RDMA: fix the ib_map_mr_sg_zbva() argument

Fixes warning: Using plain integer as NULL pointer

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_frmr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc62765..66b3d6228a15 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
 	struct ib_send_wr *failed_wr;
 	struct ib_reg_wr reg_wr;
-	int ret;
+	int ret, off = 0;
 
 	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		cpu_relax();
 	}
 
-	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+				&off, PAGE_SIZE);
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;
 
-- 
cgit v1.2.3


From 8d5d8a5fd7f9337b2eff689df14ff3ae617f3ae6 Mon Sep 17 00:00:00 2001
From: Qing Huang
Date: Mon, 4 Jul 2016 16:29:13 -0700
Subject: RDS: RDMA: start rdma listening after init

This prevents RDS from handling incoming rdma packets before RDS
completes initializing its recv/send components.

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma_transport.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index d5f311767157..fc59821f0a27 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -206,18 +206,13 @@ static int rds_rdma_init(void)
 {
 	int ret;
 
-	ret = rds_rdma_listen_init();
+	ret = rds_ib_init();
 	if (ret)
 		goto out;
 
-	ret = rds_ib_init();
+	ret = rds_rdma_listen_init();
 	if (ret)
-		goto err_ib_init;
-
-	goto out;
-
-err_ib_init:
-	rds_rdma_listen_stop();
+		rds_ib_exit();
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 584a8279a44a800dea5a5c1e9d53a002e03016b4 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 4 Jul 2016 17:04:37 -0700
Subject: RDS: RDMA: return appropriate error on rdma map failures

The first message to a remote node should prompt a new
connection even if it is RDMA operation. For RDMA operation
the MR mapping can fail because connections is not yet up.

Since the connection establishment is asynchronous,
we make sure the map failure because of unavailable
connection reach to the user by appropriate error code.
Before returning to the user, lets trigger the connection
so that its ready for the next retry.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/send.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index bb13c56fc2f8..0a6f38b1c8a5 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -945,6 +945,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
 			if (!ret)
 				*allocated_mr = 1;
+			else if (ret == -ENODEV)
+				/* Accommodate the get_mr() case which can fail
+				 * if connection isn't established yet.
+				 */
+				ret = -EAGAIN;
 			break;
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
@@ -1082,8 +1087,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* Parse any control messages the user may have included. */
 	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
-	if (ret)
+	if (ret) {
+		/* Trigger connection so that its ready for the next retry */
+		if (ret ==  -EAGAIN)
+			rds_conn_connect_if_down(conn);
 		goto out;
+	}
 
 	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
 		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-- 
cgit v1.2.3


From 56012459310a1dbcc55c2dbf5500a9f7571402cb Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Tue, 8 Mar 2016 09:19:01 -0800
Subject: RDS: IB: split the mr registration and invalidation path

MR invalidation in RDS is done in background thread and not in
data path like registration. So break the dependency between them
which helps to remove the performance bottleneck.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h      |  4 +++-
 net/rds/ib_cm.c   |  9 +++++++--
 net/rds/ib_frmr.c | 11 ++++++-----
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f4e81214e70a..f14c26d22b27 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,7 +14,8 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
-#define RDS_IB_DEFAULT_FR_WR		512
+#define RDS_IB_DEFAULT_FR_WR		256
+#define RDS_IB_DEFAULT_FR_INV_WR	256
 
 #define RDS_IB_DEFAULT_RETRY_COUNT	1
 
@@ -125,6 +126,7 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
+	atomic_t		i_fastunreg_wrs;
 
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b9da1e59ecc1..3002acf75766 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -382,7 +382,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * completion queue and send queue. This extra space is used for FRMR
 	 * registration and invalidation work requests
 	 */
-	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+	fr_queue_space = rds_ibdev->use_fastreg ?
+			 (RDS_IB_DEFAULT_FR_WR + 1) +
+			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
+			 : 0;
 
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
@@ -444,6 +447,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+	atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -766,7 +770,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
-			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+			   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 66b3d6228a15..48332a6ed738 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -241,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (frmr->fr_state != FRMR_IS_INUSE)
 		goto out;
 
-	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+	while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		cpu_relax();
 	}
 
@@ -261,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (unlikely(ret)) {
 		frmr->fr_state = FRMR_IS_STALE;
 		frmr->fr_inv = false;
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
@@ -289,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		atomic_inc(&ic->i_fastreg_wrs);
+	} else {
+		atomic_inc(&ic->i_fastunreg_wrs);
 	}
-
-	atomic_inc(&ic->i_fastreg_wrs);
 }
 
 void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
-- 
cgit v1.2.3


From c536a068870a08fb7b35482e701a6b72e294b493 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Sun, 3 Jul 2016 19:14:10 -0700
Subject: RDS: RDMA: silence the use_once mr log flood

In absence of extension headers, message log will keep
flooding the console. As such even without use_once we can
clean up the MRs so its not really an error case message
so make it debug message

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index ea961144084f..4297f3f337d7 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -415,7 +415,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 	if (!mr) {
-		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+			 r_key);
 		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 		return;
 	}
-- 
cgit v1.2.3


From 581d53c91cbf7b31415a9ed5e9a8b89d6af609b3 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Sat, 9 Jul 2016 18:31:38 -0700
Subject: RDS: IB: track and log active side endpoint in connection

Useful to know the active and passive end points in a
RDS IB connection.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h    |  3 +++
 net/rds/ib_cm.c | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f14c26d22b27..5f02b4d8f10c 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -181,6 +181,9 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
+
+	/* Endpoint role in connection */
+	bool			i_active_side;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 3002acf75766..4d1bf04b06b5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -120,16 +120,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+		pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+			  ic->i_active_side ? "Active" : "Passive",
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version),
 			  ic->i_flowctl ? ", flow control" : "");
 	}
 
-	/*
-	 * Init rings and fill recv. this needs to wait until protocol negotiation
-	 * is complete, since ring layout is different from 3.0 to 3.1.
+	/* Init rings and fill recv. this needs to wait until protocol
+	 * negotiation is complete, since ring layout is different
+	 * from 3.1 to 4.1.
 	 */
 	rds_ib_send_init_ring(ic);
 	rds_ib_recv_init_ring(ic);
@@ -685,6 +686,7 @@ out:
 		if (ic->i_cm_id == cm_id)
 			ret = 0;
 	}
+	ic->i_active_side = true;
 	return ret;
 }
 
@@ -859,6 +861,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 	ic->i_sends = NULL;
 	vfree(ic->i_recvs);
 	ic->i_recvs = NULL;
+	ic->i_active_side = false;
 }
 
 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-- 
cgit v1.2.3


From 09b2b8f52895addd9bf28dc5ac98ff5cc750cf9a Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Sat, 9 Jul 2016 17:14:02 -0700
Subject: RDS: IB: add few useful cache stasts

Tracks the ib receive cache total, incoming and frag allocations.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h       | 7 +++++++
 net/rds/ib_recv.c  | 6 ++++++
 net/rds/ib_stats.c | 2 ++
 3 files changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 5f02b4d8f10c..c62e5513d306 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -151,6 +151,7 @@ struct rds_ib_connection {
 	u64			i_ack_recv;	/* last ACK received */
 	struct rds_ib_refill_cache i_cache_incs;
 	struct rds_ib_refill_cache i_cache_frags;
+	atomic_t		i_cache_allocs;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -254,6 +255,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rx_refill_from_cq;
 	uint64_t	s_ib_rx_refill_from_thread;
 	uint64_t	s_ib_rx_alloc_limit;
+	uint64_t	s_ib_rx_total_frags;
+	uint64_t	s_ib_rx_total_incs;
 	uint64_t	s_ib_rx_credit_updates;
 	uint64_t	s_ib_ack_sent;
 	uint64_t	s_ib_ack_send_failure;
@@ -276,6 +279,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_1m_reused;
 	uint64_t	s_ib_atomic_cswp;
 	uint64_t	s_ib_atomic_fadd;
+	uint64_t	s_ib_recv_added_to_cache;
+	uint64_t	s_ib_recv_removed_from_cache;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -406,6 +411,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
 #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+#define rds_ib_stats_add(member, count) \
+		rds_stats_add_which(rds_ib_stats, member, count)
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 6803b75eb8bd..4b0f12679219 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
 	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
 
 	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+	atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+	rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 }
 
 /* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 			atomic_dec(&rds_ib_allocation);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_incs);
 	}
 	INIT_LIST_HEAD(&ibinc->ii_frags);
 	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
 	if (cache_item) {
 		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+		rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 	} else {
 		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
 		if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 			kmem_cache_free(rds_ib_frag_slab, frag);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_frags);
 	}
 
 	INIT_LIST_HEAD(&frag->f_item);
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca1f252..9252ad126335 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rx_refill_from_cq",
 	"ib_rx_refill_from_thread",
 	"ib_rx_alloc_limit",
+	"ib_rx_total_frags",
+	"ib_rx_total_incs",
 	"ib_rx_credit_updates",
 	"ib_ack_sent",
 	"ib_ack_send_failure",
-- 
cgit v1.2.3


From be2f76eacc278c272f26d46e4168efe5a55f5383 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 4 Jul 2016 16:16:36 -0700
Subject: RDS: IB: Add vector spreading for cqs

Based on available device vectors, allocate cqs accordingly to
get better spread of completion vectors which helps performace
great deal..

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.c    | 11 +++++++++++
 net/rds/ib.h    |  5 +++++
 net/rds/ib_cm.c | 40 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90b0b77..8d70884d7bb6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -111,6 +111,9 @@ static void rds_ib_dev_free(struct work_struct *work)
 		kfree(i_ipaddr);
 	}
 
+	if (rds_ibdev->vector_load)
+		kfree(rds_ibdev->vector_load);
+
 	kfree(rds_ibdev);
 }
 
@@ -159,6 +162,14 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
 	rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
 
+	rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
+					 GFP_KERNEL);
+	if (!rds_ibdev->vector_load) {
+		pr_err("RDS/IB: %s failed to allocate vector memory\n",
+			__func__);
+		goto put_dev;
+	}
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device, 0);
 	if (IS_ERR(rds_ibdev->pd)) {
diff --git a/net/rds/ib.h b/net/rds/ib.h
index c62e5513d306..1fe9f79fead5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -185,6 +185,10 @@ struct rds_ib_connection {
 
 	/* Endpoint role in connection */
 	bool			i_active_side;
+
+	/* Send/Recv vectors */
+	int			i_scq_vector;
+	int			i_rcq_vector;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -227,6 +231,7 @@ struct rds_ib_device {
 	spinlock_t		spinlock;	/* protect the above */
 	atomic_t		refcount;
 	struct work_struct	free_work;
+	int			*vector_load;
 };
 
 #define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 4d1bf04b06b5..33c8584ada1f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -358,6 +358,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
 	tasklet_schedule(&ic->i_send_tasklet);
 }
 
+static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
+{
+	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
+	int index = rds_ibdev->dev->num_comp_vectors - 1;
+	int i;
+
+	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
+		if (rds_ibdev->vector_load[i] < min) {
+			index = i;
+			min = rds_ibdev->vector_load[i];
+		}
+	}
+
+	rds_ibdev->vector_load[index]++;
+	return index;
+}
+
+static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
+{
+	rds_ibdev->vector_load[index]--;
+}
+
 /*
  * This needs to be very careful to not leave IS_ERR pointers around for
  * cleanup to trip over.
@@ -399,25 +421,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	/* Protection domain and memory range */
 	ic->i_pd = rds_ibdev->pd;
 
+	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
-
+	cq_attr.comp_vector = ic->i_scq_vector;
 	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_send_cq)) {
 		ret = PTR_ERR(ic->i_send_cq);
 		ic->i_send_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
 		rdsdebug("ib_create_cq send failed: %d\n", ret);
 		goto out;
 	}
 
+	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_recv_ring.w_nr;
+	cq_attr.comp_vector = ic->i_rcq_vector;
 	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_recv_cq)) {
 		ret = PTR_ERR(ic->i_recv_cq);
 		ic->i_recv_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
 		rdsdebug("ib_create_cq recv failed: %d\n", ret);
 		goto out;
 	}
@@ -780,10 +807,17 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		/* first destroy the ib state that generates callbacks */
 		if (ic->i_cm_id->qp)
 			rdma_destroy_qp(ic->i_cm_id);
-		if (ic->i_send_cq)
+		if (ic->i_send_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
 			ib_destroy_cq(ic->i_send_cq);
-		if (ic->i_recv_cq)
+		}
+
+		if (ic->i_recv_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
 			ib_destroy_cq(ic->i_recv_cq);
+		}
 
 		/* then free the resources that ib callbacks use */
 		if (ic->i_send_hdrs)
-- 
cgit v1.2.3


From 941f8d55f6d613a460a5e080d25a38509f45eb75 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Thu, 18 Feb 2016 20:06:47 -0800
Subject: RDS: RDMA: Fix the composite message user notification

When application sends an RDS RDMA composite message consist of
RDMA transfer to be followed up by non RDMA payload, it expect to
be notified *only* when the full message gets delivered. RDS RDMA
notification doesn't behave this way though.

Thanks to Venkat for debug and root casuing the issue
where only first part of the message(RDMA) was
successfully delivered but remainder payload delivery failed.
In that case, application should not be notified with
a false positive of message delivery success.

Fix this case by making sure the user gets notified only after
the full message delivery.

Reviewed-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_send.c | 25 +++++++++++++++----------
 net/rds/rdma.c    | 10 ++++++++++
 net/rds/rds.h     |  1 +
 net/rds/send.c    |  4 +++-
 4 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 19eca5c4c00c..5e72de10c484 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
 	complete(rm, notify_status);
 }
 
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
-				   struct rm_data_op *op,
-				   int wc_status)
-{
-	if (op->op_nents)
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-				op->op_sg, op->op_nents,
-				DMA_TO_DEVICE);
-}
-
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
 				   struct rm_rdma_op *op,
 				   int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
 		rds_ib_stats_inc(s_ib_atomic_fadd);
 }
 
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, data);
+
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+
+	if (rm->rdma.op_active && rm->data.op_notify)
+		rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
+}
+
 /*
  * Unmap the resources associated with a struct send_work.
  *
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4297f3f337d7..138aef644c56 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -627,6 +627,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		}
 		op->op_notifier->n_user_token = args->user_token;
 		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+
+		/* Enable rmda notification on data operation for composite
+		 * rds messages and make sure notification is enabled only
+		 * for the data operation which follows it so that application
+		 * gets notified only after full message gets delivered.
+		 */
+		if (rm->data.op_sg) {
+			rm->rdma.op_notify = 0;
+			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+		}
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
diff --git a/net/rds/rds.h b/net/rds/rds.h
index ebbf909b87ec..0bb8213c7d0b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -419,6 +419,7 @@ struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
+			unsigned int		op_notify:1;
 			unsigned int		op_nents;
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
diff --git a/net/rds/send.c b/net/rds/send.c
index 0a6f38b1c8a5..45e025b65d29 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 	struct rm_rdma_op *ro;
 	struct rds_notifier *notifier;
 	unsigned long flags;
+	unsigned int notify = 0;
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
+	notify =  rm->rdma.op_notify | rm->data.op_notify;
 	ro = &rm->rdma;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro->op_active && ro->op_notify && ro->op_notifier) {
+	    ro->op_active && notify && ro->op_notifier) {
 		notifier = ro->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
-- 
cgit v1.2.3


From cf657269d311d575eb196c7045579b3443631b8b Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Thu, 29 Sep 2016 11:07:11 -0700
Subject: RDS: IB: fix panic due to handlers running post teardown

Shutdown code reaping loop takes care of emptying the
CQ's before they being destroyed. And once tasklets are
killed, the hanlders are not expected to run.

But because of core tasklet code issues, tasklet handler could
still run even after tasklet_kill,
RDS IB shutdown code already reaps the CQs before freeing
cq/qp resources so as such the handlers have nothing left
to do post shutdown.

On other hand any handler running after teardown and trying
to access already freed qp/cq resources causes issues
Patch fixes this race by  makes sure that handlers returns
without any action post teardown.

Reviewed-by: Wengang <wen.gang.wang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h    |  1 +
 net/rds/ib_cm.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 1fe9f79fead5..540458928f3c 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -185,6 +185,7 @@ struct rds_ib_connection {
 
 	/* Endpoint role in connection */
 	bool			i_active_side;
+	atomic_t		i_cq_quiesce;
 
 	/* Send/Recv vectors */
 	int			i_scq_vector;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 33c8584ada1f..ce3775abc6e7 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -128,6 +128,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 			  ic->i_flowctl ? ", flow control" : "");
 	}
 
+	atomic_set(&ic->i_cq_quiesce, 0);
+
 	/* Init rings and fill recv. this needs to wait until protocol
 	 * negotiation is complete, since ring layout is different
 	 * from 3.1 to 4.1.
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
 	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	memset(&state, 0, sizeof(state));
 	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
 	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -804,6 +814,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
+		atomic_set(&ic->i_cq_quiesce, 1);
+
 		/* first destroy the ib state that generates callbacks */
 		if (ic->i_cm_id->qp)
 			rdma_destroy_qp(ic->i_cm_id);
-- 
cgit v1.2.3


From 192a798f52998a643cef84fce0204be56666b0bf Mon Sep 17 00:00:00 2001
From: Venkat Venkatsubra
Date: Sat, 9 Jul 2016 17:36:20 -0700
Subject: RDS: add stat for socket recv memory usage

Tracks the receive side memory added to scokets and removed from sockets.

Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rds.h  | 3 +++
 net/rds/recv.c | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0bb8213c7d0b..8ccd5a93e56c 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -631,6 +631,9 @@ struct rds_statistics {
 	uint64_t	s_cong_update_received;
 	uint64_t	s_cong_send_error;
 	uint64_t	s_cong_send_blocked;
+	uint64_t	s_recv_bytes_added_to_socket;
+	uint64_t	s_recv_bytes_removed_from_socket;
+
 };
 
 /* af_rds.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9d0666e5fe35..ba19eeeae85a 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -94,6 +94,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 		return;
 
 	rs->rs_rcv_bytes += delta;
+	if (delta > 0)
+		rds_stats_add(s_recv_bytes_added_to_socket, delta);
+	else
+		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
 	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
 	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
-- 
cgit v1.2.3


From f9fb69adb6c7acca60977a4db5a5f95b8e66c041 Mon Sep 17 00:00:00 2001
From: Avinash Repaka
Date: Mon, 29 Feb 2016 15:30:57 -0800
Subject: RDS: make message size limit compliant with spec

RDS support max message size as 1M but the code doesn't check this
in all cases. Patch fixes it for RDMA & non-RDMA and RDS MR size
and its enforced irrespective of underlying transport.

Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma.c |  9 ++++++++-
 net/rds/rds.h  |  3 +++
 net/rds/send.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 138aef644c56..f06fac4886b0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
 /*
  * XXX
  *  - build with sparse
- *  - should we limit the size of a mr region?  let transport return failure?
  *  - should we detect duplicate keys on a socket?  hmm.
  *  - an rdma is an mlock, apply rlimit?
  */
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
+	/* Restrict the size of mr irrespective of underlying transport
+	 * To account for unaligned mr regions, subtract one from nr_pages
+	 */
+	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
 		args->vec.addr, args->vec.bytes, nr_pages);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8ccd5a93e56c..f713194e4620 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
 #define RDS_FRAG_SHIFT	12
 #define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
 
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))
+
 #define RDS_CONG_MAP_BYTES	(65536 / 8)
 #define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
 #define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
diff --git a/net/rds/send.c b/net/rds/send.c
index 45e025b65d29..5cc64039caf7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -994,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
 	return hash;
 }
 
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+	struct rds_rdma_args *args;
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+			args = CMSG_DATA(cmsg);
+			*rdma_bytes += args->remote_vec.bytes;
+		}
+	}
+	return 0;
+}
+
 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
 	struct sock *sk = sock->sk;
@@ -1008,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
+	size_t total_payload_len = payload_len, rdma_payload_len = 0;
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
@@ -1040,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	}
 	release_sock(sk);
 
+	ret = rds_rdma_bytes(msg, &rdma_payload_len);
+	if (ret)
+		goto out;
+
+	total_payload_len += rdma_payload_len;
+	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	if (payload_len > rds_sk_sndbuf(rs)) {
 		ret = -EMSGSIZE;
 		goto out;
-- 
cgit v1.2.3


From 3289025aedc018f8fd9d0e37fb9efa0c6d531ffa Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 4 Jul 2016 22:35:15 -0700
Subject: RDS: add receive message trace used by application

Socket option to tap receive path latency in various stages
in nano seconds. It can be enabled on selective sockets using
using SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return
the data to application with RDS_CMSG_RXPATH_LATENCY in defined
format. Scope is left to add more trace points for future
without need of change in the interface.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 include/uapi/linux/rds.h | 33 +++++++++++++++++++++++++++++++++
 net/rds/af_rds.c         | 28 ++++++++++++++++++++++++++++
 net/rds/ib_recv.c        |  4 ++++
 net/rds/rds.h            | 10 ++++++++++
 net/rds/recv.c           | 32 +++++++++++++++++++++++++++++---
 net/rds/tcp_recv.c       |  5 +++++
 6 files changed, 109 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 0f9265cb2a96..3833113ab2c0 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -52,6 +52,13 @@
 #define RDS_GET_MR_FOR_DEST		7
 #define SO_RDS_TRANSPORT		8
 
+/* Socket option to tap receive path latency
+ *	SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
+ *	Format used struct rds_rx_trace_so
+ */
+#define SO_RDS_MSG_RXPATH_LATENCY	10
+
+
 /* supported values for SO_RDS_TRANSPORT */
 #define	RDS_TRANS_IB	0
 #define	RDS_TRANS_IWARP	1
@@ -77,6 +84,12 @@
  *	the same as for the GET_MR setsockopt.
  * RDS_CMSG_RDMA_STATUS (recvmsg)
  *	Returns the status of a completed RDMA operation.
+ * RDS_CMSG_RXPATH_LATENCY(recvmsg)
+ *	Returns rds message latencies in various stages of receive
+ *	path in nS. Its set per socket using SO_RDS_MSG_RXPATH_LATENCY
+ *	socket option. Legitimate points are defined in
+ *	enum rds_message_rxpath_latency. More points can be added in
+ *	future. CSMG format is struct rds_cmsg_rx_trace.
  */
 #define RDS_CMSG_RDMA_ARGS		1
 #define RDS_CMSG_RDMA_DEST		2
@@ -87,6 +100,7 @@
 #define RDS_CMSG_ATOMIC_CSWP		7
 #define RDS_CMSG_MASKED_ATOMIC_FADD	8
 #define RDS_CMSG_MASKED_ATOMIC_CSWP	9
+#define RDS_CMSG_RXPATH_LATENCY		11
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -171,6 +185,25 @@ struct rds_info_rdma_connection {
 	uint32_t	rdma_mr_size;
 };
 
+/* RDS message Receive Path Latency points */
+enum rds_message_rxpath_latency {
+	RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
+	RDS_MSG_RX_DGRAM_REASSEMBLE,
+	RDS_MSG_RX_DGRAM_DELIVERED,
+	RDS_MSG_RX_DGRAM_TRACE_MAX
+};
+
+struct rds_rx_trace_so {
+	u8 rx_traces;
+	u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
+struct rds_cmsg_rx_trace {
+	u8 rx_traces;
+	u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+	u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
 /*
  * Congestion monitoring.
  * Congestion control in RDS happens at the host connection
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2ac1e6194be3..fd8217404162 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+				  int optlen)
+{
+	struct rds_rx_trace_so trace;
+	int i;
+
+	if (optlen != sizeof(struct rds_rx_trace_so))
+		return -EFAULT;
+
+	if (copy_from_user(&trace, optval, sizeof(trace)))
+		return -EFAULT;
+
+	rs->rs_rx_traces = trace.rx_traces;
+	for (i = 0; i < rs->rs_rx_traces; i++) {
+		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
+			rs->rs_rx_traces = 0;
+			return -EFAULT;
+		}
+		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
+	}
+
+	return 0;
+}
+
 static int rds_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
 		release_sock(sock->sk);
 		break;
+	case SO_RDS_MSG_RXPATH_LATENCY:
+		ret = rds_recv_track_latency(rs, optval, optlen);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 	}
@@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_cong_list);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
+	rs->rs_rx_traces = 0;
 
 	spin_lock_bh(&rds_sock_lock);
 	list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4b0f12679219..e10624aa6959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		ic->i_ibinc = ibinc;
 
 		hdr = &ibinc->ii_inc.i_hdr;
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+				local_clock();
 		memcpy(hdr, ihdr, sizeof(*hdr));
 		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+				local_clock();
 
 		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
 			 ic->i_recv_data_rem, hdr->h_flags);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f713194e4620..07fff73dd4f3 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest {
 #define RDS_EXTHDR_GEN_NUM	6
 
 #define __RDS_EXTHDR_MAX	16 /* for now */
+#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define	RDS_MSG_RX_HDR		0
+#define	RDS_MSG_RX_START	1
+#define	RDS_MSG_RX_END		2
+#define	RDS_MSG_RX_CMSG		3
 
 struct rds_incoming {
 	atomic_t		i_refcount;
@@ -265,6 +270,7 @@ struct rds_incoming {
 
 	rds_rdma_cookie_t	i_rdma_cookie;
 	struct timeval		i_rx_tstamp;
+	u64			i_rx_lat_trace[RDS_RX_MAX_TRACES];
 };
 
 struct rds_mr {
@@ -575,6 +581,10 @@ struct rds_sock {
 	unsigned char		rs_recverr,
 				rs_cong_monitor;
 	u32			rs_hash_initval;
+
+	/* Socket receive path trace points*/
+	u8			rs_rx_traces;
+	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index ba19eeeae85a..8b7e7b7f2c2d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr)
 {
+	int i;
+
 	atomic_set(&inc->i_refcount, 1);
 	INIT_LIST_HEAD(&inc->i_item);
 	inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 	inc->i_rdma_cookie = 0;
 	inc->i_rx_tstamp.tv_sec = 0;
 	inc->i_rx_tstamp.tv_usec = 0;
+
+	for (i = 0; i < RDS_RX_MAX_TRACES; i++)
+		inc->i_rx_lat_trace[i] = 0;
 }
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
@@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		if (sock_flag(sk, SOCK_RCVTSTAMP))
 			do_gettimeofday(&inc->i_rx_tstamp);
 		rds_inc_addref(inc);
+		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
 		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
 		__rds_wake_sk_sleep(sk);
 	} else {
@@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
 				sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 			       sizeof(struct timeval),
 			       &inc->i_rx_tstamp);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
-	return 0;
+	if (rs->rs_rx_traces) {
+		struct rds_cmsg_rx_trace t;
+		int i, j;
+
+		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+		t.rx_traces =  rs->rs_rx_traces;
+		for (i = 0; i < rs->rs_rx_traces; i++) {
+			j = rs->rs_rx_trace[i];
+			t.rx_trace_pos[i] = j;
+			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
+					  inc->i_rx_lat_trace[j];
+		}
+
+		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+			       sizeof(t), &t);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
 }
 
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e97f91..e006ef8e6d40 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 			rdsdebug("alloced tinc %p\n", tinc);
 			rds_inc_path_init(&tinc->ti_inc, cp,
 					  cp->cp_conn->c_faddr);
+			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+					local_clock();
+
 			/*
 			 * XXX * we might be able to use the __ variants when
 			 * we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 				/* could be 0 for a 0 len message */
 				tc->t_tinc_data_rem =
 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+					local_clock();
 			}
 		}
 
-- 
cgit v1.2.3


From e4781421e883340b796da5a724bda7226817990b Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 20 Dec 2016 21:57:02 +0100
Subject: netfilter: merge udp and udplite conntrack helpers

udplite was copied from udp, they are virtually 100% identical.

This adds udplite tracker to udp instead, removes udplite module,
and then makes the udplite tracker builtin.

udplite will then simply re-use udp timeout settings.
It makes little sense to add separate sysctls, nowadays we have
fine-grained timeout policy support via the CT target.

old:
 text    data     bss     dec     hex filename
 1633     672       0    2305     901 nf_conntrack_proto_udp.o
 1756     672       0    2428     97c nf_conntrack_proto_udplite.o
69526   17937     268   87731   156b3 nf_conntrack.ko

new:
 text    data     bss     dec     hex filename
 2442    1184       0    3626     e2a nf_conntrack_proto_udp.o
68565   17721     268   86554   1521a nf_conntrack.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |   1 +
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |   1 +
 include/net/netns/conntrack.h                  |  16 --
 net/netfilter/Makefile                         |   1 -
 net/netfilter/nf_conntrack_proto_udp.c         | 123 ++++++++++
 net/netfilter/nf_conntrack_proto_udplite.c     | 324 -------------------------
 6 files changed, 125 insertions(+), 341 deletions(-)
 delete mode 100644 net/netfilter/nf_conntrack_proto_udplite.c

(limited to 'net')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 919e4e8af327..6ff32815641b 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -14,6 +14,7 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index eaea968f8657..c59b82456f89 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -5,6 +5,7 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
 
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index cf799fc3fdec..17724c62de97 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,19 +69,6 @@ struct nf_sctp_net {
 };
 #endif
 
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-enum udplite_conntrack {
-	UDPLITE_CT_UNREPLIED,
-	UDPLITE_CT_REPLIED,
-	UDPLITE_CT_MAX
-};
-
-struct nf_udplite_net {
-	struct nf_proto_net pn;
-	unsigned int timeouts[UDPLITE_CT_MAX];
-};
-#endif
-
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -94,9 +81,6 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	struct nf_sctp_net	sctp;
 #endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-	struct nf_udplite_net	udplite;
-#endif
 };
 
 struct ct_pcpu {
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ca30d1960f1d..bf5c577113b6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -7,7 +7,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 20f35ed68030..ae63944c9dc4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -108,6 +108,59 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+			 struct sk_buff *skb,
+			 unsigned int dataoff,
+			 enum ip_conntrack_info *ctinfo,
+			 u8 pf, unsigned int hooknum)
+{
+	unsigned int udplen = skb->len - dataoff;
+	const struct udphdr *hdr;
+	struct udphdr _hdr;
+	unsigned int cscov;
+
+	/* Header is too small? */
+	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (!hdr) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	cscov = ntohs(hdr->len);
+	if (cscov == 0) {
+		cscov = udplen;
+	} else if (cscov < sizeof(*hdr) || cscov > udplen) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: invalid checksum coverage ");
+		return -NF_ACCEPT;
+	}
+
+	/* UDPLITE mandates checksums */
+	if (!hdr->check) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: checksum missing ");
+		return -NF_ACCEPT;
+	}
+
+	/* Checksum invalid? Ignore. */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+				pf)) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: bad UDPLite checksum ");
+		return -NF_ACCEPT;
+	}
+
+	return NF_ACCEPT;
+}
+#endif
+
 static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 		     unsigned int dataoff, enum ip_conntrack_info *ctinfo,
 		     u_int8_t pf,
@@ -290,6 +343,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{
+	.l3proto		= PF_INET,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.allow_clash		= true,
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.get_timeouts		= udp_get_timeouts,
+	.new			= udp_new,
+	.error			= udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+	.ctnl_timeout		= {
+		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
+		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
+		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
+		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+		.nla_policy	= udp_timeout_nla_policy,
+	},
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
+#endif
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 {
 	.l3proto		= PF_INET6,
@@ -322,3 +410,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 	.get_net_proto		= udp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
+
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{
+	.l3proto		= PF_INET6,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.allow_clash		= true,
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.get_timeouts		= udp_get_timeouts,
+	.new			= udp_new,
+	.error			= udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+	.ctnl_timeout		= {
+		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
+		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
+		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
+		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+		.nla_policy	= udp_timeout_nla_policy,
+	},
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
+#endif
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
deleted file mode 100644
index c35f7bf05d8c..000000000000
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/udp.h>
-#include <linux/seq_file.h>
-#include <linux/skbuff.h>
-#include <linux/ipv6.h>
-#include <net/ip6_checksum.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_log.h>
-
-static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
-	[UDPLITE_CT_UNREPLIED]	= 30*HZ,
-	[UDPLITE_CT_REPLIED]	= 180*HZ,
-};
-
-static inline struct nf_udplite_net *udplite_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.udplite;
-}
-
-static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
-				 unsigned int dataoff,
-				 struct net *net,
-				 struct nf_conntrack_tuple *tuple)
-{
-	const struct udphdr *hp;
-	struct udphdr _hdr;
-
-	/* Actually only need first 4 bytes to get ports. */
-	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (hp == NULL)
-		return false;
-
-	tuple->src.u.udp.port = hp->source;
-	tuple->dst.u.udp.port = hp->dest;
-	return true;
-}
-
-static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.udp.port = orig->dst.u.udp.port;
-	tuple->dst.u.udp.port = orig->src.u.udp.port;
-	return true;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static void udplite_print_tuple(struct seq_file *s,
-				const struct nf_conntrack_tuple *tuple)
-{
-	seq_printf(s, "sport=%hu dport=%hu ",
-		   ntohs(tuple->src.u.udp.port),
-		   ntohs(tuple->dst.u.udp.port));
-}
-
-static unsigned int *udplite_get_timeouts(struct net *net)
-{
-	return udplite_pernet(net)->timeouts;
-}
-
-/* Returns verdict for packet, and may modify conntracktype */
-static int udplite_packet(struct nf_conn *ct,
-			  const struct sk_buff *skb,
-			  unsigned int dataoff,
-			  enum ip_conntrack_info ctinfo,
-			  u_int8_t pf,
-			  unsigned int hooknum,
-			  unsigned int *timeouts)
-{
-	/* If we've seen traffic both ways, this is some kind of UDP
-	   stream.  Extend timeout. */
-	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-		nf_ct_refresh_acct(ct, ctinfo, skb,
-				   timeouts[UDPLITE_CT_REPLIED]);
-		/* Also, more likely to be important, and not a probe */
-		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
-			nf_conntrack_event_cache(IPCT_ASSURED, ct);
-	} else {
-		nf_ct_refresh_acct(ct, ctinfo, skb,
-				   timeouts[UDPLITE_CT_UNREPLIED]);
-	}
-	return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
-			unsigned int dataoff, unsigned int *timeouts)
-{
-	return true;
-}
-
-static int udplite_error(struct net *net, struct nf_conn *tmpl,
-			 struct sk_buff *skb,
-			 unsigned int dataoff,
-			 enum ip_conntrack_info *ctinfo,
-			 u_int8_t pf,
-			 unsigned int hooknum)
-{
-	unsigned int udplen = skb->len - dataoff;
-	const struct udphdr *hdr;
-	struct udphdr _hdr;
-	unsigned int cscov;
-
-	/* Header is too small? */
-	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
-	if (hdr == NULL) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: short packet ");
-		return -NF_ACCEPT;
-	}
-
-	cscov = ntohs(hdr->len);
-	if (cscov == 0)
-		cscov = udplen;
-	else if (cscov < sizeof(*hdr) || cscov > udplen) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				"nf_ct_udplite: invalid checksum coverage ");
-		return -NF_ACCEPT;
-	}
-
-	/* UDPLITE mandates checksums */
-	if (!hdr->check) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: checksum missing ");
-		return -NF_ACCEPT;
-	}
-
-	/* Checksum invalid? Ignore. */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
-	    			pf)) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: bad UDPLite checksum ");
-		return -NF_ACCEPT;
-	}
-
-	return NF_ACCEPT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
-					 struct net *net, void *data)
-{
-	unsigned int *timeouts = data;
-	struct nf_udplite_net *un = udplite_pernet(net);
-
-	/* set default timeouts for UDPlite. */
-	timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
-	timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
-
-	if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
-		timeouts[UDPLITE_CT_UNREPLIED] =
-		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
-	}
-	if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
-		timeouts[UDPLITE_CT_REPLIED] =
-		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
-	}
-	return 0;
-}
-
-static int
-udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
-	const unsigned int *timeouts = data;
-
-	if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
-			 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
-	    nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
-			 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -ENOSPC;
-}
-
-static const struct nla_policy
-udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
-	[CTA_TIMEOUT_UDPLITE_UNREPLIED]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_UDPLITE_REPLIED]	= { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table udplite_sysctl_table[] = {
-	{
-		.procname	= "nf_conntrack_udplite_timeout",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "nf_conntrack_udplite_timeout_stream",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{ }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
-					struct nf_udplite_net *un)
-{
-#ifdef CONFIG_SYSCTL
-	if (pn->ctl_table)
-		return 0;
-
-	pn->ctl_table = kmemdup(udplite_sysctl_table,
-				sizeof(udplite_sysctl_table),
-				GFP_KERNEL);
-	if (!pn->ctl_table)
-		return -ENOMEM;
-
-	pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
-	pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
-#endif
-	return 0;
-}
-
-static int udplite_init_net(struct net *net, u_int16_t proto)
-{
-	struct nf_udplite_net *un = udplite_pernet(net);
-	struct nf_proto_net *pn = &un->pn;
-
-	if (!pn->users) {
-		int i;
-
-		for (i = 0 ; i < UDPLITE_CT_MAX; i++)
-			un->timeouts[i] = udplite_timeouts[i];
-	}
-
-	return udplite_kmemdup_sysctl_table(pn, un);
-}
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
-{
-	.l3proto		= PF_INET,
-	.l4proto		= IPPROTO_UDPLITE,
-	.name			= "udplite",
-	.allow_clash		= true,
-	.pkt_to_tuple		= udplite_pkt_to_tuple,
-	.invert_tuple		= udplite_invert_tuple,
-	.print_tuple		= udplite_print_tuple,
-	.packet			= udplite_packet,
-	.get_timeouts		= udplite_get_timeouts,
-	.new			= udplite_new,
-	.error			= udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX,
-		.obj_size	= sizeof(unsigned int) *
-					CTA_TIMEOUT_UDPLITE_MAX,
-		.nla_policy	= udplite_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udplite_init_net,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
-{
-	.l3proto		= PF_INET6,
-	.l4proto		= IPPROTO_UDPLITE,
-	.name			= "udplite",
-	.allow_clash		= true,
-	.pkt_to_tuple		= udplite_pkt_to_tuple,
-	.invert_tuple		= udplite_invert_tuple,
-	.print_tuple		= udplite_print_tuple,
-	.packet			= udplite_packet,
-	.get_timeouts		= udplite_get_timeouts,
-	.new			= udplite_new,
-	.error			= udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX,
-		.obj_size	= sizeof(unsigned int) *
-					CTA_TIMEOUT_UDPLITE_MAX,
-		.nla_policy	= udplite_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udplite_init_net,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
-- 
cgit v1.2.3


From 9700ba805b4f58d26c4621ad6ba6b0e632e7a04b Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 20 Dec 2016 21:57:03 +0100
Subject: netfilter: nat: merge udp and udplite helpers

udplite nat was copied from udp nat, they are virtually 100% identical.
Not really surprising given udplite is just udp with partial csum coverage.

old:
   text    data     bss     dec     hex filename
  11606    1457     210   13273    33d9 nf_nat.ko
    330       0       2     332     14c nf_nat_proto_udp.o
    276       0       2     278     116 nf_nat_proto_udplite.o
new:
   text    data     bss     dec     hex filename
  11598    1457     210   13265    33d1 nf_nat.ko
    640       0       4     644     284 nf_nat_proto_udp.o

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Makefile               |  1 -
 net/netfilter/nf_nat_proto_udp.c     | 78 ++++++++++++++++++++++++++++++------
 net/netfilter/nf_nat_proto_udplite.c | 73 ---------------------------------
 3 files changed, 66 insertions(+), 86 deletions(-)
 delete mode 100644 net/netfilter/nf_nat_proto_udplite.c

(limited to 'net')

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index bf5c577113b6..6b3034f12661 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -46,7 +46,6 @@ nf_nat-y	:= nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
 # NAT protocols (nf_nat)
 nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
 nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
-nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
 
 # generic transport layer logging
 obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b1e627227b6e..edd4a77dc09a 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				    &udp_port_rover);
 }
 
-static bool
-udp_manip_pkt(struct sk_buff *skb,
-	      const struct nf_nat_l3proto *l3proto,
-	      unsigned int iphdroff, unsigned int hdroff,
-	      const struct nf_conntrack_tuple *tuple,
-	      enum nf_nat_manip_type maniptype)
+static void
+__udp_manip_pkt(struct sk_buff *skb,
+	        const struct nf_nat_l3proto *l3proto,
+	        unsigned int iphdroff, struct udphdr *hdr,
+	        const struct nf_conntrack_tuple *tuple,
+	        enum nf_nat_manip_type maniptype, bool do_csum)
 {
-	struct udphdr *hdr;
 	__be16 *portptr, newport;
 
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		/* Get rid of src port */
 		newport = tuple->src.u.udp.port;
@@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb,
 		newport = tuple->dst.u.udp.port;
 		portptr = &hdr->dest;
 	}
-	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+	if (do_csum) {
 		l3proto->csum_update(skb, iphdroff, &hdr->check,
 				     tuple, maniptype);
 		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
@@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb,
 			hdr->check = CSUM_MANGLED_0;
 	}
 	*portptr = newport;
+}
+
+static bool udp_manip_pkt(struct sk_buff *skb,
+			  const struct nf_nat_l3proto *l3proto,
+			  unsigned int iphdroff, unsigned int hdroff,
+			  const struct nf_conntrack_tuple *tuple,
+			  enum nf_nat_manip_type maniptype)
+{
+	struct udphdr *hdr;
+	bool do_csum;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct udphdr *)(skb->data + hdroff);
+	do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+
+	__udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
+	return true;
+}
+
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+static u16 udplite_port_rover;
+
+static bool udplite_manip_pkt(struct sk_buff *skb,
+			      const struct nf_nat_l3proto *l3proto,
+			      unsigned int iphdroff, unsigned int hdroff,
+			      const struct nf_conntrack_tuple *tuple,
+			      enum nf_nat_manip_type maniptype)
+{
+	struct udphdr *hdr;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct udphdr *)(skb->data + hdroff);
+	__udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
 	return true;
 }
 
+static void
+udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		     struct nf_conntrack_tuple *tuple,
+		     const struct nf_nat_range *range,
+		     enum nf_nat_manip_type maniptype,
+		     const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &udplite_port_rover);
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+	.l4proto		= IPPROTO_UDPLITE,
+	.manip_pkt		= udplite_manip_pkt,
+	.in_range		= nf_nat_l4proto_in_range,
+	.unique_tuple		= udplite_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
+
 const struct nf_nat_l4proto nf_nat_l4proto_udp = {
 	.l4proto		= IPPROTO_UDP,
 	.manip_pkt		= udp_manip_pkt,
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 366bfbfd82a1..000000000000
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u16 udplite_port_rover;
-
-static void
-udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
-		     struct nf_conntrack_tuple *tuple,
-		     const struct nf_nat_range *range,
-		     enum nf_nat_manip_type maniptype,
-		     const struct nf_conn *ct)
-{
-	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
-				    &udplite_port_rover);
-}
-
-static bool
-udplite_manip_pkt(struct sk_buff *skb,
-		  const struct nf_nat_l3proto *l3proto,
-		  unsigned int iphdroff, unsigned int hdroff,
-		  const struct nf_conntrack_tuple *tuple,
-		  enum nf_nat_manip_type maniptype)
-{
-	struct udphdr *hdr;
-	__be16 *portptr, newport;
-
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of source port */
-		newport = tuple->src.u.udp.port;
-		portptr = &hdr->source;
-	} else {
-		/* Get rid of dst port */
-		newport = tuple->dst.u.udp.port;
-		portptr = &hdr->dest;
-	}
-
-	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
-	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
-	if (!hdr->check)
-		hdr->check = CSUM_MANGLED_0;
-
-	*portptr = newport;
-	return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
-	.l4proto		= IPPROTO_UDPLITE,
-	.manip_pkt		= udplite_manip_pkt,
-	.in_range		= nf_nat_l4proto_in_range,
-	.unique_tuple		= udplite_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-- 
cgit v1.2.3


From 949a358418aae397d7cf1622aa6515eca766b9e7 Mon Sep 17 00:00:00 2001
From: Liping Zhang
Date: Sun, 25 Dec 2016 19:58:59 +0800
Subject: netfilter: nft_ct: add average bytes per packet support

Similar to xt_connbytes, user can match how many average bytes per packet
a connection has transferred so far.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_ct.c                   | 22 +++++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5726f90bfc2f..b00a05d1ee56 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -863,6 +863,7 @@ enum nft_rt_attributes {
  * @NFT_CT_LABELS: conntrack labels
  * @NFT_CT_PKTS: conntrack packets
  * @NFT_CT_BYTES: conntrack bytes
+ * @NFT_CT_AVGPKT: conntrack average bytes per packet
  */
 enum nft_ct_keys {
 	NFT_CT_STATE,
@@ -881,6 +882,7 @@ enum nft_ct_keys {
 	NFT_CT_LABELS,
 	NFT_CT_PKTS,
 	NFT_CT_BYTES,
+	NFT_CT_AVGPKT,
 };
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index e6baeaebe653..d774d7823688 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -129,6 +129,22 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 		memcpy(dest, &count, sizeof(count));
 		return;
 	}
+	case NFT_CT_AVGPKT: {
+		const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+		u64 avgcnt = 0, bcnt = 0, pcnt = 0;
+
+		if (acct) {
+			pcnt = nft_ct_get_eval_counter(acct->counter,
+						       NFT_CT_PKTS, priv->dir);
+			bcnt = nft_ct_get_eval_counter(acct->counter,
+						       NFT_CT_BYTES, priv->dir);
+			if (pcnt != 0)
+				avgcnt = div64_u64(bcnt, pcnt);
+		}
+
+		memcpy(dest, &avgcnt, sizeof(avgcnt));
+		return;
+	}
 	case NFT_CT_L3PROTOCOL:
 		*dest = nf_ct_l3num(ct);
 		return;
@@ -316,6 +332,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 		break;
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
+	case NFT_CT_AVGPKT:
 		/* no direction? return sum of original + reply */
 		if (tb[NFTA_CT_DIRECTION] == NULL)
 			priv->dir = IP_CT_DIR_MAX;
@@ -346,7 +363,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS)
+	if (priv->key == NFT_CT_BYTES ||
+	    priv->key == NFT_CT_PKTS  ||
+	    priv->key == NFT_CT_AVGPKT)
 		nf_ct_set_acct(ctx->net, true);
 
 	return 0;
@@ -445,6 +464,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		break;
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
+	case NFT_CT_AVGPKT:
 		if (priv->dir < IP_CT_DIR_MAX &&
 		    nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
 			goto nla_put_failure;
-- 
cgit v1.2.3


From 1708ebc9636a249e104b83c6d105f15244825281 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Tue, 3 Jan 2017 12:13:39 +0100
Subject: ipmr, ip6mr: add RTNH_F_UNRESOLVED flag to unresolved cache entries

While working with ipmr, we noticed that it is impossible to determine
if an entry is actually unresolved or its IIF interface has disappeared
(e.g. virtual interface got deleted). These entries look almost
identical to user-space when dumping or receiving notifications. So in
order to recognize them add a new RTNH_F_UNRESOLVED flag which is set when
sending an unresolved cache entry to user-space.

Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 1 +
 net/ipv4/ipmr.c                | 4 +++-
 net/ipv6/ip6mr.c               | 4 +++-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index e14377f2ec27..8c93ad1ef9ab 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -350,6 +350,7 @@ struct rtnexthop {
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
 #define RTNH_F_OFFLOAD		8	/* offloaded route */
 #define RTNH_F_LINKDOWN		16	/* carrier-down on nexthop */
+#define RTNH_F_UNRESOLVED	32	/* The entry is unresolved (ipmr) */
 
 #define RTNH_COMPARE_MASK	(RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD)
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efc1e76d4977..b35dda57586b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2091,8 +2091,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	int ct;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mfc_parent >= MAXVIFS)
+	if (c->mfc_parent >= MAXVIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
 		return -ENOENT;
+	}
 
 	if (VIF_EXISTS(mrt, c->mfc_parent) &&
 	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 604d8953c775..e275077e8af2 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2243,8 +2243,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 	int ct;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mf6c_parent >= MAXMIFS)
+	if (c->mf6c_parent >= MAXMIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
 		return -ENOENT;
+	}
 
 	if (MIF_EXISTS(mrt, c->mf6c_parent) &&
 	    nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
-- 
cgit v1.2.3


From 7f953ab2ba46e8649537942c0a64668ca2ce5cc5 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan
Date: Tue, 3 Jan 2017 06:31:47 -0800
Subject: af_packet: TX_RING support for TPACKET_V3

Although TPACKET_V3 Rx has some benefits over TPACKET_V2 Rx, *_v3
does not currently have TX_RING support. As a result an application
that wants the best perf for Tx and Rx (e.g. to handle request/response
transacations) ends up needing 2 sockets, one with *_v2 for Tx and
another with *_v3 for Rx.

This patch enables TPACKET_V2 compatible Tx features in TPACKET_V3
so that an application can use a single descriptor to get the benefits
of _v3 RX_RING and _v2 TX_RING. An application may do a block-send by
first filling up multiple frames in the Tx ring and then triggering a
transmit. This patch only support fixed size Tx frames for TPACKET_V3,
and requires that tp_next_offset must be zero.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/packet_mmap.txt |  9 ++++++--
 net/packet/af_packet.c                   | 39 ++++++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index daa015af16a0..f3b9e507ab05 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -565,7 +565,7 @@ TPACKET_V1 --> TPACKET_V2:
 		   (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))
 
 TPACKET_V2 --> TPACKET_V3:
-	- Flexible buffer implementation:
+	- Flexible buffer implementation for RX_RING:
 		1. Blocks can be configured with non-static frame-size
 		2. Read/poll is at a block-level (as opposed to packet-level)
 		3. Added poll timeout to avoid indefinite user-space wait
@@ -574,7 +574,12 @@ TPACKET_V2 --> TPACKET_V3:
 			4.1 block::timeout
 			4.2 tpkt_hdr::sk_rxhash
 	- RX Hash data available in user space
-	- Currently only RX_RING available
+	- TX_RING semantics are conceptually similar to TPACKET_V2;
+	  use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN
+	  instead of TPACKET2_HDRLEN. In the current implementation,
+	  the tp_next_offset field in the tpacket3_hdr MUST be set to
+	  zero, indicating that the ring does not hold variable sized frames.
+	  Packets with non-zero values of tp_next_offset will be dropped.
 
 -------------------------------------------------------------------------------
 + AF_PACKET fanout mode
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b9e1a13b4ba3..7e39087d519b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	case TPACKET_V3:
+		h.h3->tp_status = status;
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	case TPACKET_V3:
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		return h.h3->tp_status;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -2497,6 +2502,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 	ph.raw = frame;
 
 	switch (po->tp_version) {
+	case TPACKET_V3:
+		if (ph.h3->tp_next_offset != 0) {
+			pr_warn_once("variable sized slot not supported");
+			return -EINVAL;
+		}
+		tp_len = ph.h3->tp_len;
+		break;
 	case TPACKET_V2:
 		tp_len = ph.h2->tp_len;
 		break;
@@ -2516,6 +2528,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 		off_max = po->tx_ring.frame_size - tp_len;
 		if (po->sk.sk_type == SOCK_DGRAM) {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_net;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_net;
 				break;
@@ -2525,6 +2540,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 			}
 		} else {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_mac;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_mac;
 				break;
@@ -4113,11 +4131,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	struct tpacket_req *req = &req_u->req;
 
 	lock_sock(sk);
-	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
-	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-		net_warn_ratelimited("Tx-ring is not supported.\n");
-		goto out;
-	}
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4177,11 +4190,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 		switch (po->tp_version) {
 		case TPACKET_V3:
-		/* Transmit path is not supported. We checked
-		 * it above but just being paranoid
-		 */
-			if (!tx_ring)
+			/* Block transmit is not supported yet */
+			if (!tx_ring) {
 				init_prb_bdqc(po, rb, pg_vec, req_u);
+			} else {
+				struct tpacket_req3 *req3 = &req_u->req3;
+
+				if (req3->tp_retire_blk_tov ||
+				    req3->tp_sizeof_priv ||
+				    req3->tp_feature_req_word) {
+					err = -EINVAL;
+					goto out;
+				}
+			}
 			break;
 		default:
 			break;
-- 
cgit v1.2.3


From 8c44e1af16b2983b3df93117cd0ca40638998ce3 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Tue, 3 Jan 2017 10:55:09 -0500
Subject: tipc: unify tipc_wait_for_sndpkt() and tipc_wait_for_sndmsg()
 functions

The functions tipc_wait_for_sndpkt() and tipc_wait_for_sndmsg() are very
similar. The latter function is also called from two locations, and
there will be more in the coming commits, which will all need to test on
different conditions.

Instead of making yet another duplicates of the function, we now
introduce a new macro tipc_wait_for_cond() where the wakeup condition
can be stated as an argument to the call. This macro replaces all
current and future uses of the two functions, which can now be
eliminated.

Acked-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 108 +++++++++++++++++++++++++-----------------------------
 1 file changed, 49 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 800caaa699a1..f27462eeccbe 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -110,7 +110,6 @@ static void tipc_write_space(struct sock *sk);
 static void tipc_sock_destruct(struct sock *sk);
 static int tipc_release(struct socket *sock);
 static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
 static void tipc_sk_timeout(unsigned long data);
 static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 			   struct tipc_name_seq const *seq);
@@ -334,6 +333,49 @@ static int tipc_set_sk_state(struct sock *sk, int state)
 	return res;
 }
 
+static int tipc_sk_sock_err(struct socket *sock, long *timeout)
+{
+	struct sock *sk = sock->sk;
+	int err = sock_error(sk);
+	int typ = sock->type;
+
+	if (err)
+		return err;
+	if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
+		if (sk->sk_state == TIPC_DISCONNECTING)
+			return -EPIPE;
+		else if (!tipc_sk_connected(sk))
+			return -ENOTCONN;
+	}
+	if (!*timeout)
+		return -EAGAIN;
+	if (signal_pending(current))
+		return sock_intr_errno(*timeout);
+
+	return 0;
+}
+
+#define tipc_wait_for_cond(sock_, timeout_, condition_)			\
+({								        \
+	int rc_ = 0;							\
+	int done_ = 0;							\
+									\
+	while (!(condition_) && !done_) {				\
+		struct sock *sk_ = sock->sk;				\
+		DEFINE_WAIT_FUNC(wait_, woken_wake_function);		\
+									\
+		rc_ = tipc_sk_sock_err(sock_, timeout_);		\
+		if (rc_)						\
+			break;						\
+		prepare_to_wait(sk_sleep(sk_), &wait_,			\
+				TASK_INTERRUPTIBLE);			\
+		done_ = sk_wait_event(sk_, timeout_,			\
+				      (condition_), &wait_);		\
+		remove_wait_queue(sk_sleep(sk_), &wait_);		\
+	}								\
+	rc_;								\
+})
+
 /**
  * tipc_sk_create - create a TIPC socket
  * @net: network namespace (must be default network)
@@ -721,7 +763,7 @@ new_mtu:
 
 		if (rc == -ELINKCONG) {
 			tsk->link_cong = 1;
-			rc = tipc_wait_for_sndmsg(sock, &timeo);
+			rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
 			if (!rc)
 				continue;
 		}
@@ -830,31 +872,6 @@ exit:
 	kfree_skb(skb);
 }
 
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
-	int done;
-
-	do {
-		int err = sock_error(sk);
-		if (err)
-			return err;
-		if (sk->sk_shutdown & SEND_SHUTDOWN)
-			return -EPIPE;
-		if (!*timeo_p)
-			return -EAGAIN;
-		if (signal_pending(current))
-			return sock_intr_errno(*timeo_p);
-
-		add_wait_queue(sk_sleep(sk), &wait);
-		done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait);
-		remove_wait_queue(sk_sleep(sk), &wait);
-	} while (!done);
-	return 0;
-}
-
 /**
  * tipc_sendmsg - send message in connectionless manner
  * @sock: socket structure
@@ -970,7 +987,7 @@ new_mtu:
 		}
 		if (rc == -ELINKCONG) {
 			tsk->link_cong = 1;
-			rc = tipc_wait_for_sndmsg(sock, &timeo);
+			rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
 			if (!rc)
 				continue;
 		}
@@ -985,36 +1002,6 @@ new_mtu:
 	return rc;
 }
 
-static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
-	int done;
-
-	do {
-		int err = sock_error(sk);
-		if (err)
-			return err;
-		if (sk->sk_state == TIPC_DISCONNECTING)
-			return -EPIPE;
-		else if (!tipc_sk_connected(sk))
-			return -ENOTCONN;
-		if (!*timeo_p)
-			return -EAGAIN;
-		if (signal_pending(current))
-			return sock_intr_errno(*timeo_p);
-
-		add_wait_queue(sk_sleep(sk), &wait);
-		done = sk_wait_event(sk, timeo_p,
-				     (!tsk->link_cong &&
-				      !tsk_conn_cong(tsk)) ||
-				      !tipc_sk_connected(sk), &wait);
-		remove_wait_queue(sk_sleep(sk), &wait);
-	} while (!done);
-	return 0;
-}
-
 /**
  * tipc_send_stream - send stream-oriented data
  * @sock: socket structure
@@ -1109,7 +1096,10 @@ next:
 
 			tsk->link_cong = 1;
 		}
-		rc = tipc_wait_for_sndpkt(sock, &timeo);
+		rc = tipc_wait_for_cond(sock, &timeo,
+					(!tsk->link_cong &&
+					 !tsk_conn_cong(tsk) &&
+					 tipc_sk_connected(sk)));
 	} while (!rc);
 
 	__skb_queue_purge(&pktchain);
-- 
cgit v1.2.3


From 4d8642d896c53966d32d5e343c3620813dd0e7c8 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Tue, 3 Jan 2017 10:55:10 -0500
Subject: tipc: modify struct tipc_plist to be more versatile

During multicast reception we currently use a simple linked list with
push/pop semantics to store port numbers.

We now see a need for a more generic list for storing values of type
u32. We therefore make some modifications to this list, while replacing
the prefix 'tipc_plist_' with 'u32_'. We also add a couple of new
functions which will come to use in the next commits.

Acked-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 100 ++++++++++++++++++++++++++++++++++++--------------
 net/tipc/name_table.h |  21 ++++-------
 net/tipc/socket.c     |   8 ++--
 3 files changed, 83 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e190460fe0d3..5a86df1e5fc2 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -608,7 +608,7 @@ not_found:
  * Returns non-zero if any off-node ports overlap
  */
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct tipc_plist *dports)
+			      u32 limit, struct list_head *dports)
 {
 	struct name_seq *seq;
 	struct sub_seq *sseq;
@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 		info = sseq->info;
 		list_for_each_entry(publ, &info->node_list, node_list) {
 			if (publ->scope <= limit)
-				tipc_plist_push(dports, publ->ref);
+				u32_push(dports, publ->ref);
 		}
 
 		if (info->cluster_list_size != info->node_list_size)
@@ -1022,40 +1022,84 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-void tipc_plist_push(struct tipc_plist *pl, u32 port)
+struct u32_item {
+	struct list_head list;
+	u32 value;
+};
+
+bool u32_find(struct list_head *l, u32 value)
 {
-	struct tipc_plist *nl;
+	struct u32_item *item;
 
-	if (likely(!pl->port)) {
-		pl->port = port;
-		return;
+	list_for_each_entry(item, l, list) {
+		if (item->value == value)
+			return true;
 	}
-	if (pl->port == port)
-		return;
-	list_for_each_entry(nl, &pl->list, list) {
-		if (nl->port == port)
-			return;
+	return false;
+}
+
+bool u32_push(struct list_head *l, u32 value)
+{
+	struct u32_item *item;
+
+	list_for_each_entry(item, l, list) {
+		if (item->value == value)
+			return false;
+	}
+	item = kmalloc(sizeof(*item), GFP_ATOMIC);
+	if (unlikely(!item))
+		return false;
+
+	item->value = value;
+	list_add(&item->list, l);
+	return true;
+}
+
+u32 u32_pop(struct list_head *l)
+{
+	struct u32_item *item;
+	u32 value = 0;
+
+	if (list_empty(l))
+		return 0;
+	item = list_first_entry(l, typeof(*item), list);
+	value = item->value;
+	list_del(&item->list);
+	kfree(item);
+	return value;
+}
+
+bool u32_del(struct list_head *l, u32 value)
+{
+	struct u32_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, l, list) {
+		if (item->value != value)
+			continue;
+		list_del(&item->list);
+		kfree(item);
+		return true;
 	}
-	nl = kmalloc(sizeof(*nl), GFP_ATOMIC);
-	if (nl) {
-		nl->port = port;
-		list_add(&nl->list, &pl->list);
+	return false;
+}
+
+void u32_list_purge(struct list_head *l)
+{
+	struct u32_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, l, list) {
+		list_del(&item->list);
+		kfree(item);
 	}
 }
 
-u32 tipc_plist_pop(struct tipc_plist *pl)
+int u32_list_len(struct list_head *l)
 {
-	struct tipc_plist *nl;
-	u32 port = 0;
+	struct u32_item *item;
+	int i = 0;
 
-	if (likely(list_empty(&pl->list))) {
-		port = pl->port;
-		pl->port = 0;
-		return port;
+	list_for_each_entry(item, l, list) {
+		i++;
 	}
-	nl = list_first_entry(&pl->list, typeof(*nl), list);
-	port = nl->port;
-	list_del(&nl->list);
-	kfree(nl);
-	return port;
+	return i;
 }
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1524a73830f7..c89bb3f5c364 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -99,7 +99,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct tipc_plist *dports);
+			      u32 limit, struct list_head *dports);
 struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 					 u32 upper, u32 scope, u32 port_ref,
 					 u32 key);
@@ -116,18 +116,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
 
-struct tipc_plist {
-	struct list_head list;
-	u32 port;
-};
-
-static inline void tipc_plist_init(struct tipc_plist *pl)
-{
-	INIT_LIST_HEAD(&pl->list);
-	pl->port = 0;
-}
-
-void tipc_plist_push(struct tipc_plist *pl, u32 port);
-u32 tipc_plist_pop(struct tipc_plist *pl);
+bool u32_push(struct list_head *l, u32 value);
+u32 u32_pop(struct list_head *l);
+bool u32_find(struct list_head *l, u32 value);
+bool u32_del(struct list_head *l, u32 value);
+void u32_list_purge(struct list_head *l);
+int u32_list_len(struct list_head *l);
 
 #endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f27462eeccbe..fae6a55ef1b0 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -788,7 +788,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		       struct sk_buff_head *inputq)
 {
 	struct tipc_msg *msg;
-	struct tipc_plist dports;
+	struct list_head dports;
 	u32 portid;
 	u32 scope = TIPC_CLUSTER_SCOPE;
 	struct sk_buff_head tmpq;
@@ -796,7 +796,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 	struct sk_buff *skb, *_skb;
 
 	__skb_queue_head_init(&tmpq);
-	tipc_plist_init(&dports);
+	INIT_LIST_HEAD(&dports);
 
 	skb = tipc_skb_peek(arrvq, &inputq->lock);
 	for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -810,8 +810,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		tipc_nametbl_mc_translate(net,
 					  msg_nametype(msg), msg_namelower(msg),
 					  msg_nameupper(msg), scope, &dports);
-		portid = tipc_plist_pop(&dports);
-		for (; portid; portid = tipc_plist_pop(&dports)) {
+		portid = u32_pop(&dports);
+		for (; portid; portid = u32_pop(&dports)) {
 			_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
 			if (_skb) {
 				msg_set_destport(buf_msg(_skb), portid);
-- 
cgit v1.2.3


From 365ad353c2564bba8835290061308ba825166b3a Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Tue, 3 Jan 2017 10:55:11 -0500
Subject: tipc: reduce risk of user starvation during link congestion

The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.

This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.

In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.

The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.

Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c  |   6 +-
 net/tipc/link.c   |  75 +++++-------
 net/tipc/msg.h    |   2 -
 net/tipc/node.c   |  15 +--
 net/tipc/socket.c | 347 ++++++++++++++++++++++++------------------------------
 5 files changed, 194 insertions(+), 251 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babbea385..c35fad3e08e8 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
  *                    and to identified node local sockets
  * @net: the applicable net namespace
  * @list: chain of buffers containing message
- * Consumes the buffer chain, except when returning -ELINKCONG
+ * Consumes the buffer chain.
  * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
  */
 int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
@@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
 	tipc_bcast_unlock(net);
 
 	/* Don't send to local node if adding to link failed */
-	if (unlikely(rc)) {
+	if (unlikely(rc && (rc != -ELINKCONG))) {
 		__skb_queue_purge(&rcvq);
 		return rc;
 	}
@@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
 	tipc_bcbase_xmit(net, &xmitq);
 	tipc_sk_mcast_rcv(net, &rcvq, &inputq);
 	__skb_queue_purge(list);
-	return 0;
+	return rc;
 }
 
 /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
diff --git a/net/tipc/link.c b/net/tipc/link.c
index bda89bf9f4ff..b758ca8b2f79 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 
 /**
  * link_schedule_user - schedule a message sender for wakeup after congestion
- * @link: congested link
- * @list: message that was attempted sent
+ * @l: congested link
+ * @hdr: header of message that is being sent
  * Create pseudo msg to send back to user when congestion abates
- * Does not consume buffer list
  */
-static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
+static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
 {
-	struct tipc_msg *msg = buf_msg(skb_peek(list));
-	int imp = msg_importance(msg);
-	u32 oport = msg_origport(msg);
-	u32 addr = tipc_own_addr(link->net);
+	u32 dnode = tipc_own_addr(l->net);
+	u32 dport = msg_origport(hdr);
 	struct sk_buff *skb;
 
-	/* This really cannot happen...  */
-	if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
-		pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
-		return -ENOBUFS;
-	}
-	/* Non-blocking sender: */
-	if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
-		return -ELINKCONG;
-
 	/* Create and schedule wakeup pseudo message */
 	skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
-			      addr, addr, oport, 0, 0);
+			      dnode, l->addr, dport, 0, 0);
 	if (!skb)
 		return -ENOBUFS;
-	TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
-	TIPC_SKB_CB(skb)->chain_imp = imp;
-	skb_queue_tail(&link->wakeupq, skb);
-	link->stats.link_congs++;
+	msg_set_dest_droppable(buf_msg(skb), true);
+	TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
+	skb_queue_tail(&l->wakeupq, skb);
+	l->stats.link_congs++;
 	return -ELINKCONG;
 }
 
 /**
  * link_prepare_wakeup - prepare users for wakeup after congestion
- * @link: congested link
- * Move a number of waiting users, as permitted by available space in
- * the send queue, from link wait queue to node wait queue for wakeup
+ * @l: congested link
+ * Wake up a number of waiting users, as permitted by available space
+ * in the send queue
  */
 void link_prepare_wakeup(struct tipc_link *l)
 {
-	int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
-	int imp, lim;
 	struct sk_buff *skb, *tmp;
+	int imp, i = 0;
 
 	skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
 		imp = TIPC_SKB_CB(skb)->chain_imp;
-		lim = l->backlog[imp].limit;
-		pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
-		if ((pnd[imp] + l->backlog[imp].len) >= lim)
+		if (l->backlog[imp].len < l->backlog[imp].limit) {
+			skb_unlink(skb, &l->wakeupq);
+			skb_queue_tail(l->inputq, skb);
+		} else if (i++ > 10) {
 			break;
-		skb_unlink(skb, &l->wakeupq);
-		skb_queue_tail(l->inputq, skb);
+		}
 	}
 }
 
@@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l)
  * @list: chain of buffers containing message
  * @xmitq: returned list of packets to be sent by caller
  *
- * Consumes the buffer chain, except when returning -ELINKCONG,
- * since the caller then may want to make more send attempts.
+ * Consumes the buffer chain.
  * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
  * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
  */
@@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 {
 	struct tipc_msg *hdr = buf_msg(skb_peek(list));
 	unsigned int maxwin = l->window;
-	unsigned int i, imp = msg_importance(hdr);
+	int imp = msg_importance(hdr);
 	unsigned int mtu = l->mtu;
 	u16 ack = l->rcv_nxt - 1;
 	u16 seqno = l->snd_nxt;
@@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 	struct sk_buff_head *backlogq = &l->backlogq;
 	struct sk_buff *skb, *_skb, *bskb;
 	int pkt_cnt = skb_queue_len(list);
+	int rc = 0;
 
-	/* Match msg importance against this and all higher backlog limits: */
-	if (!skb_queue_empty(backlogq)) {
-		for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
-			if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
-				return link_schedule_user(l, list);
-		}
-	}
 	if (unlikely(msg_size(hdr) > mtu)) {
 		skb_queue_purge(list);
 		return -EMSGSIZE;
 	}
 
+	/* Allow oversubscription of one data msg per source at congestion */
+	if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
+		if (imp == TIPC_SYSTEM_IMPORTANCE) {
+			pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
+			return -ENOBUFS;
+		}
+		rc = link_schedule_user(l, hdr);
+	}
+
 	if (pkt_cnt > 1) {
 		l->stats.sent_fragmented++;
 		l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 		skb_queue_splice_tail_init(list, backlogq);
 	}
 	l->snd_nxt = seqno;
-	return 0;
+	return rc;
 }
 
 void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 8d408612ffa4..850ae0e469f5 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,8 +98,6 @@ struct tipc_skb_cb {
 	u32 bytes_read;
 	struct sk_buff *tail;
 	bool validated;
-	bool wakeup_pending;
-	u16 chain_sz;
 	u16 chain_imp;
 	u16 ackers;
 };
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9d2f4c2b08ab..2883f6a0ed98 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1167,7 +1167,7 @@ msg_full:
  * @list: chain of buffers containing message
  * @dnode: address of destination node
  * @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
+ * Consumes the buffer chain.
  * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
  */
 int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 	spin_unlock_bh(&le->lock);
 	tipc_node_read_unlock(n);
 
-	if (likely(rc == 0))
-		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
-	else if (rc == -ENOBUFS)
+	if (unlikely(rc == -ENOBUFS))
 		tipc_node_link_down(n, bearer_id, false);
+	else
+		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
 
 	tipc_node_put(n);
 
@@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
  * messages, which will not be rejected
  * The only exception is datagram messages rerouted after secondary
  * lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
  */
 int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
 		       u32 selector)
 {
 	struct sk_buff_head head;
-	int rc;
 
 	skb_queue_head_init(&head);
 	__skb_queue_tail(&head, skb);
-	rc = tipc_node_xmit(net, &head, dnode, selector);
-	if (rc == -ELINKCONG)
-		kfree_skb(skb);
+	tipc_node_xmit(net, &head, dnode, selector);
 	return 0;
 }
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index fae6a55ef1b0..d2f353934f82 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -67,12 +67,14 @@ enum {
  * @max_pkt: maximum packet size "hint" used when building messages sent by port
  * @portid: unique port identity in TIPC socket hash table
  * @phdr: preformatted message header used when sending messages
+ * #cong_links: list of congested links
  * @publications: list of publications for port
+ * @blocking_link: address of the congested link we are currently sleeping on
  * @pub_count: total # of publications port has made during its lifetime
  * @probing_state:
  * @conn_timeout: the time we can wait for an unresponded setup request
  * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
- * @link_cong: non-zero if owner must sleep because of link congestion
+ * @cong_link_cnt: number of congested links
  * @sent_unacked: # messages sent by socket, and not yet acked by peer
  * @rcv_unacked: # messages read by user, but not yet acked back to peer
  * @peer: 'connected' peer for dgram/rdm
@@ -87,13 +89,13 @@ struct tipc_sock {
 	u32 max_pkt;
 	u32 portid;
 	struct tipc_msg phdr;
-	struct list_head sock_list;
+	struct list_head cong_links;
 	struct list_head publications;
 	u32 pub_count;
 	uint conn_timeout;
 	atomic_t dupl_rcvcnt;
 	bool probe_unacked;
-	bool link_cong;
+	u16 cong_link_cnt;
 	u16 snt_unacked;
 	u16 snd_win;
 	u16 peer_caps;
@@ -118,8 +120,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
 static int tipc_sk_insert(struct tipc_sock *tsk);
 static void tipc_sk_remove(struct tipc_sock *tsk);
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m,
-			      size_t dsz);
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
 static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
 
 static const struct proto_ops packet_ops;
@@ -424,6 +425,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	tsk = tipc_sk(sk);
 	tsk->max_pkt = MAX_PKT_DEFAULT;
 	INIT_LIST_HEAD(&tsk->publications);
+	INIT_LIST_HEAD(&tsk->cong_links);
 	msg = &tsk->phdr;
 	tn = net_generic(sock_net(sk), tipc_net_id);
 	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
@@ -474,9 +476,14 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct net *net = sock_net(sk);
+	long timeout = CONN_TIMEOUT_DEFAULT;
 	u32 dnode = tsk_peer_node(tsk);
 	struct sk_buff *skb;
 
+	/* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */
+	tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
+					    !tsk_conn_cong(tsk)));
+
 	/* Reject all unreceived messages, except on an active connection
 	 * (which disconnects locally & sends a 'FIN+' to peer).
 	 */
@@ -547,7 +554,8 @@ static int tipc_release(struct socket *sock)
 
 	/* Reject any messages that accumulated in backlog queue */
 	release_sock(sk);
-
+	u32_list_purge(&tsk->cong_links);
+	tsk->cong_link_cnt = 0;
 	call_rcu(&tsk->rcu, tipc_sk_callback);
 	sock->sk = NULL;
 
@@ -690,7 +698,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 
 	switch (sk->sk_state) {
 	case TIPC_ESTABLISHED:
-		if (!tsk->link_cong && !tsk_conn_cong(tsk))
+		if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
 			mask |= POLLOUT;
 		/* fall thru' */
 	case TIPC_LISTEN:
@@ -699,7 +707,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			mask |= (POLLIN | POLLRDNORM);
 		break;
 	case TIPC_OPEN:
-		if (!tsk->link_cong)
+		if (!tsk->cong_link_cnt)
 			mask |= POLLOUT;
 		if (tipc_sk_type_connectionless(sk) &&
 		    (!skb_queue_empty(&sk->sk_receive_queue)))
@@ -718,63 +726,48 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
  * @sock: socket structure
  * @seq: destination address
  * @msg: message to send
- * @dsz: total length of message data
- * @timeo: timeout to wait for wakeup
+ * @dlen: length of data to send
+ * @timeout: timeout to wait for wakeup
  *
  * Called from function tipc_sendmsg(), which has done all sanity checks
  * Returns the number of bytes sent on success, or errno
  */
 static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
-			  struct msghdr *msg, size_t dsz, long timeo)
+			  struct msghdr *msg, size_t dlen, long timeout)
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	struct sk_buff_head pktchain;
-	struct iov_iter save = msg->msg_iter;
-	uint mtu;
+	int mtu = tipc_bcast_get_mtu(net);
+	struct sk_buff_head pkts;
 	int rc;
 
-	if (!timeo && tsk->link_cong)
-		return -ELINKCONG;
-
-	msg_set_type(mhdr, TIPC_MCAST_MSG);
-	msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE);
-	msg_set_destport(mhdr, 0);
-	msg_set_destnode(mhdr, 0);
-	msg_set_nametype(mhdr, seq->type);
-	msg_set_namelower(mhdr, seq->lower);
-	msg_set_nameupper(mhdr, seq->upper);
-	msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
-
-	skb_queue_head_init(&pktchain);
+	rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
+	if (unlikely(rc))
+		return rc;
 
-new_mtu:
-	mtu = tipc_bcast_get_mtu(net);
-	rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
-	if (unlikely(rc < 0))
+	msg_set_type(hdr, TIPC_MCAST_MSG);
+	msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
+	msg_set_destport(hdr, 0);
+	msg_set_destnode(hdr, 0);
+	msg_set_nametype(hdr, seq->type);
+	msg_set_namelower(hdr, seq->lower);
+	msg_set_nameupper(hdr, seq->upper);
+	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
+
+	skb_queue_head_init(&pkts);
+	rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
+	if (unlikely(rc != dlen))
 		return rc;
 
-	do {
-		rc = tipc_bcast_xmit(net, &pktchain);
-		if (likely(!rc))
-			return dsz;
-
-		if (rc == -ELINKCONG) {
-			tsk->link_cong = 1;
-			rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
-			if (!rc)
-				continue;
-		}
-		__skb_queue_purge(&pktchain);
-		if (rc == -EMSGSIZE) {
-			msg->msg_iter = save;
-			goto new_mtu;
-		}
-		break;
-	} while (1);
-	return rc;
+	rc = tipc_bcast_xmit(net, &pkts);
+	if (unlikely(rc == -ELINKCONG)) {
+		tsk->cong_link_cnt = 1;
+		rc = 0;
+	}
+
+	return rc ? rc : dlen;
 }
 
 /**
@@ -898,35 +891,38 @@ static int tipc_sendmsg(struct socket *sock,
 	return ret;
 }
 
-static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 {
-	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
 	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
 	struct net *net = sock_net(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	u32 dnode, dport;
-	struct sk_buff_head pktchain;
-	bool is_connectionless = tipc_sk_type_connectionless(sk);
-	struct sk_buff *skb;
+	struct tipc_sock *tsk = tipc_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+	struct list_head *clinks = &tsk->cong_links;
+	bool syn = !tipc_sk_type_connectionless(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq *seq;
-	struct iov_iter save;
-	u32 mtu;
-	long timeo;
-	int rc;
+	struct sk_buff_head pkts;
+	u32 type, inst, domain;
+	u32 dnode, dport;
+	int mtu, rc;
 
-	if (dsz > TIPC_MAX_USER_MSG_SIZE)
+	if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
 		return -EMSGSIZE;
+
 	if (unlikely(!dest)) {
-		if (is_connectionless && tsk->peer.family == AF_TIPC)
-			dest = &tsk->peer;
-		else
+		dest = &tsk->peer;
+		if (!syn || dest->family != AF_TIPC)
 			return -EDESTADDRREQ;
-	} else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
-		   dest->family != AF_TIPC) {
-		return -EINVAL;
 	}
-	if (!is_connectionless) {
+
+	if (unlikely(m->msg_namelen < sizeof(*dest)))
+		return -EINVAL;
+
+	if (unlikely(dest->family != AF_TIPC))
+		return -EINVAL;
+
+	if (unlikely(syn)) {
 		if (sk->sk_state == TIPC_LISTEN)
 			return -EPIPE;
 		if (sk->sk_state != TIPC_OPEN)
@@ -938,72 +934,62 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
 			tsk->conn_instance = dest->addr.name.name.instance;
 		}
 	}
-	seq = &dest->addr.nameseq;
-	timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
 
-	if (dest->addrtype == TIPC_ADDR_MCAST) {
-		return tipc_sendmcast(sock, seq, m, dsz, timeo);
-	} else if (dest->addrtype == TIPC_ADDR_NAME) {
-		u32 type = dest->addr.name.name.type;
-		u32 inst = dest->addr.name.name.instance;
-		u32 domain = dest->addr.name.domain;
+	seq = &dest->addr.nameseq;
+	if (dest->addrtype == TIPC_ADDR_MCAST)
+		return tipc_sendmcast(sock, seq, m, dlen, timeout);
 
+	if (dest->addrtype == TIPC_ADDR_NAME) {
+		type = dest->addr.name.name.type;
+		inst = dest->addr.name.name.instance;
+		domain = dest->addr.name.domain;
 		dnode = domain;
-		msg_set_type(mhdr, TIPC_NAMED_MSG);
-		msg_set_hdr_sz(mhdr, NAMED_H_SIZE);
-		msg_set_nametype(mhdr, type);
-		msg_set_nameinst(mhdr, inst);
-		msg_set_lookup_scope(mhdr, tipc_addr_scope(domain));
+		msg_set_type(hdr, TIPC_NAMED_MSG);
+		msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+		msg_set_nametype(hdr, type);
+		msg_set_nameinst(hdr, inst);
+		msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
 		dport = tipc_nametbl_translate(net, type, inst, &dnode);
-		msg_set_destnode(mhdr, dnode);
-		msg_set_destport(mhdr, dport);
+		msg_set_destnode(hdr, dnode);
+		msg_set_destport(hdr, dport);
 		if (unlikely(!dport && !dnode))
 			return -EHOSTUNREACH;
+
 	} else if (dest->addrtype == TIPC_ADDR_ID) {
 		dnode = dest->addr.id.node;
-		msg_set_type(mhdr, TIPC_DIRECT_MSG);
-		msg_set_lookup_scope(mhdr, 0);
-		msg_set_destnode(mhdr, dnode);
-		msg_set_destport(mhdr, dest->addr.id.ref);
-		msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
+		msg_set_type(hdr, TIPC_DIRECT_MSG);
+		msg_set_lookup_scope(hdr, 0);
+		msg_set_destnode(hdr, dnode);
+		msg_set_destport(hdr, dest->addr.id.ref);
+		msg_set_hdr_sz(hdr, BASIC_H_SIZE);
 	}
 
-	skb_queue_head_init(&pktchain);
-	save = m->msg_iter;
-new_mtu:
+	/* Block or return if destination link is congested */
+	rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
+	if (unlikely(rc))
+		return rc;
+
+	skb_queue_head_init(&pkts);
 	mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
-	rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
-	if (rc < 0)
+	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
+	if (unlikely(rc != dlen))
 		return rc;
 
-	do {
-		skb = skb_peek(&pktchain);
-		TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
-		rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
-		if (likely(!rc)) {
-			if (!is_connectionless)
-				tipc_set_sk_state(sk, TIPC_CONNECTING);
-			return dsz;
-		}
-		if (rc == -ELINKCONG) {
-			tsk->link_cong = 1;
-			rc = tipc_wait_for_cond(sock, &timeo, !tsk->link_cong);
-			if (!rc)
-				continue;
-		}
-		__skb_queue_purge(&pktchain);
-		if (rc == -EMSGSIZE) {
-			m->msg_iter = save;
-			goto new_mtu;
-		}
-		break;
-	} while (1);
+	rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+	if (unlikely(rc == -ELINKCONG)) {
+		u32_push(clinks, dnode);
+		tsk->cong_link_cnt++;
+		rc = 0;
+	}
 
-	return rc;
+	if (unlikely(syn && !rc))
+		tipc_set_sk_state(sk, TIPC_CONNECTING);
+
+	return rc ? rc : dlen;
 }
 
 /**
- * tipc_send_stream - send stream-oriented data
+ * tipc_sendstream - send stream-oriented data
  * @sock: socket structure
  * @m: data to send
  * @dsz: total length of data to be transmitted
@@ -1013,97 +999,69 @@ new_mtu:
  * Returns the number of bytes sent on success (or partial success),
  * or errno if no data sent
  */
-static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
 {
 	struct sock *sk = sock->sk;
 	int ret;
 
 	lock_sock(sk);
-	ret = __tipc_send_stream(sock, m, dsz);
+	ret = __tipc_sendstream(sock, m, dsz);
 	release_sock(sk);
 
 	return ret;
 }
 
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 {
 	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
-	struct tipc_sock *tsk = tipc_sk(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	struct sk_buff_head pktchain;
 	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
-	u32 portid = tsk->portid;
-	int rc = -EINVAL;
-	long timeo;
-	u32 dnode;
-	uint mtu, send, sent = 0;
-	struct iov_iter save;
-	int hlen = MIN_H_SIZE;
-
-	/* Handle implied connection establishment */
-	if (unlikely(dest)) {
-		rc = __tipc_sendmsg(sock, m, dsz);
-		hlen = msg_hdr_sz(mhdr);
-		if (dsz && (dsz == rc))
-			tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
-		return rc;
-	}
-	if (dsz > (uint)INT_MAX)
-		return -EMSGSIZE;
-
-	if (unlikely(!tipc_sk_connected(sk))) {
-		if (sk->sk_state == TIPC_DISCONNECTING)
-			return -EPIPE;
-		else
-			return -ENOTCONN;
-	}
+	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
+	struct net *net = sock_net(sk);
+	struct sk_buff_head pkts;
+	u32 dnode = tsk_peer_node(tsk);
+	int send, sent = 0;
+	int rc = 0;
 
-	timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
-	if (!timeo && tsk->link_cong)
-		return -ELINKCONG;
+	skb_queue_head_init(&pkts);
 
-	dnode = tsk_peer_node(tsk);
-	skb_queue_head_init(&pktchain);
+	if (unlikely(dlen > INT_MAX))
+		return -EMSGSIZE;
 
-next:
-	save = m->msg_iter;
-	mtu = tsk->max_pkt;
-	send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
-	rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
-	if (unlikely(rc < 0))
+	/* Handle implicit connection setup */
+	if (unlikely(dest)) {
+		rc = __tipc_sendmsg(sock, m, dlen);
+		if (dlen && (dlen == rc))
+			tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
 		return rc;
+	}
 
 	do {
-		if (likely(!tsk_conn_cong(tsk))) {
-			rc = tipc_node_xmit(net, &pktchain, dnode, portid);
-			if (likely(!rc)) {
-				tsk->snt_unacked += tsk_inc(tsk, send + hlen);
-				sent += send;
-				if (sent == dsz)
-					return dsz;
-				goto next;
-			}
-			if (rc == -EMSGSIZE) {
-				__skb_queue_purge(&pktchain);
-				tsk->max_pkt = tipc_node_get_mtu(net, dnode,
-								 portid);
-				m->msg_iter = save;
-				goto next;
-			}
-			if (rc != -ELINKCONG)
-				break;
-
-			tsk->link_cong = 1;
-		}
-		rc = tipc_wait_for_cond(sock, &timeo,
-					(!tsk->link_cong &&
+		rc = tipc_wait_for_cond(sock, &timeout,
+					(!tsk->cong_link_cnt &&
 					 !tsk_conn_cong(tsk) &&
 					 tipc_sk_connected(sk)));
-	} while (!rc);
+		if (unlikely(rc))
+			break;
+
+		send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
+		rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
+		if (unlikely(rc != send))
+			break;
+
+		rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+		if (unlikely(rc == -ELINKCONG)) {
+			tsk->cong_link_cnt = 1;
+			rc = 0;
+		}
+		if (likely(!rc)) {
+			tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
+			sent += send;
+		}
+	} while (sent < dlen && !rc);
 
-	__skb_queue_purge(&pktchain);
-	return sent ? sent : rc;
+	return rc ? rc : sent;
 }
 
 /**
@@ -1121,7 +1079,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
 	if (dsz > TIPC_MAX_USER_MSG_SIZE)
 		return -EMSGSIZE;
 
-	return tipc_send_stream(sock, m, dsz);
+	return tipc_sendstream(sock, m, dsz);
 }
 
 /* tipc_sk_finish_conn - complete the setup of a connection
@@ -1688,6 +1646,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
 	unsigned int limit = rcvbuf_limit(sk, skb);
 	int err = TIPC_OK;
 	int usr = msg_user(hdr);
+	u32 onode;
 
 	if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
 		tipc_sk_proto_rcv(tsk, skb, xmitq);
@@ -1695,8 +1654,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
 	}
 
 	if (unlikely(usr == SOCK_WAKEUP)) {
+		onode = msg_orignode(hdr);
 		kfree_skb(skb);
-		tsk->link_cong = 0;
+		u32_del(&tsk->cong_links, onode);
+		tsk->cong_link_cnt--;
 		sk->sk_write_space(sk);
 		return false;
 	}
@@ -2104,7 +2065,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
 		struct msghdr m = {NULL,};
 
 		tsk_advance_rx_queue(sk);
-		__tipc_send_stream(new_sock, &m, 0);
+		__tipc_sendstream(new_sock, &m, 0);
 	} else {
 		__skb_dequeue(&sk->sk_receive_queue);
 		__skb_queue_head(&new_sk->sk_receive_queue, buf);
@@ -2565,7 +2526,7 @@ static const struct proto_ops stream_ops = {
 	.shutdown	= tipc_shutdown,
 	.setsockopt	= tipc_setsockopt,
 	.getsockopt	= tipc_getsockopt,
-	.sendmsg	= tipc_send_stream,
+	.sendmsg	= tipc_sendstream,
 	.recvmsg	= tipc_recv_stream,
 	.mmap		= sock_no_mmap,
 	.sendpage	= sock_no_sendpage
-- 
cgit v1.2.3


From ec2507d2a3060a970fa314556891828cfd60093e Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Tue, 3 Jan 2017 19:20:24 +0200
Subject: net/sched: cls_matchall: Fix error path

Fix several error paths in matchall:
 - Release reference to actions in case the hardware fails offloading
   (relevant to skip_sw only)
 - Fix error path in case tcf_exts initialization/validation fail

Fixes: bf3994d2ed31 ("net/sched: introduce Match-all classifier")
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f935429bd5ef..fcecf5aac666 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -141,10 +141,12 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 	struct tcf_exts e;
 	int err;
 
-	tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+	err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+	if (err)
+		return err;
 	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
-		return err;
+		goto errout;
 
 	if (tb[TCA_MATCHALL_CLASSID]) {
 		f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
@@ -154,6 +156,9 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 	tcf_exts_change(tp, &f->exts, &e);
 
 	return 0;
+errout:
+	tcf_exts_destroy(&e);
+	return err;
 }
 
 static int mall_change(struct net *net, struct sk_buff *in_skb,
@@ -193,7 +198,9 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	if (!f)
 		return -ENOBUFS;
 
-	tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0);
+	err = tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0);
+	if (err)
+		goto err_exts_init;
 
 	if (!handle)
 		handle = 1;
@@ -202,13 +209,13 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 
 	err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
 	if (err)
-		goto errout;
+		goto err_set_parms;
 
 	if (tc_should_offload(dev, tp, flags)) {
 		err = mall_replace_hw_filter(tp, f, (unsigned long) f);
 		if (err) {
 			if (tc_skip_sw(flags))
-				goto errout;
+				goto err_replace_hw_filter;
 			else
 				err = 0;
 		}
@@ -219,7 +226,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 
 	return 0;
 
-errout:
+err_replace_hw_filter:
+err_set_parms:
+	tcf_exts_destroy(&f->exts);
+err_exts_init:
 	kfree(f);
 	return err;
 }
-- 
cgit v1.2.3


From 1365e547c6bb6b6b137bc40a189675503baa037b Mon Sep 17 00:00:00 2001
From: Alexander Alemayhu
Date: Tue, 3 Jan 2017 17:13:20 +0100
Subject: xfrm: trivial typos

o s/descentant/descendant
o s/workarbound/workaround

Signed-off-by: Alexander Alemayhu <alexander@alemayhu.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 +-
 net/xfrm/xfrm_state.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 177e208e8ff5..99ad1af2927f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -330,7 +330,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
 
-/* Rule must be locked. Release descentant resources, announce
+/* Rule must be locked. Release descendant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
  */
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 64e3c82eedf6..c5cf4d611aab 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -409,7 +409,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 			if (x->xflags & XFRM_SOFT_EXPIRE) {
 				/* enter hard expire without soft expire first?!
 				 * setting a new date could trigger this.
-				 * workarbound: fix x->curflt.add_time by below:
+				 * workaround: fix x->curflt.add_time by below:
 				 */
 				x->curlft.add_time = now - x->saved_tmo - 1;
 				tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
-- 
cgit v1.2.3


From 5ec71dd7f1b63da20fefebabf1e66cb530b9eb4d Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Wed, 4 Jan 2017 08:24:49 +0100
Subject: cfg80211: sysfs: use wiphy_name()

Instead of open-coding dev_name(), use the wiphy_name() inline
to make the code easier to understand. While at it, clean up
some coding style.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/sysfs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 14b3f007826d..16b6b5988be9 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
 
 static ssize_t name_show(struct device *dev,
 			 struct device_attribute *attr,
-			 char *buf) {
+			 char *buf)
+{
 	struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
-	return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+
+	return sprintf(buf, "%s\n", wiphy_name(wiphy));
 }
 static DEVICE_ATTR_RO(name);
 
-- 
cgit v1.2.3


From 1ff8cebf49ed9e9ca2ae44b5c4176aef9c21af9c Mon Sep 17 00:00:00 2001
From: yuan linyu
Date: Tue, 3 Jan 2017 20:42:17 +0800
Subject: scm: remove use CMSG{_COMPAT}_ALIGN(sizeof(struct {compat_}cmsghdr))

sizeof(struct cmsghdr) and sizeof(struct compat_cmsghdr) already aligned.
remove use CMSG_ALIGN(sizeof(struct cmsghdr)) and
CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) keep code consistent.

Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h |  6 +++---
 net/compat.c           | 14 ++++++--------
 net/core/scm.c         |  2 +-
 net/ipv4/ip_sockglue.c |  2 +-
 net/rxrpc/sendmsg.c    |  2 +-
 5 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index b5cc5a6d7011..c06438023d79 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -92,9 +92,9 @@ struct cmsghdr {
 
 #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) )
 
-#define CMSG_DATA(cmsg)	((void *)((char *)(cmsg) + CMSG_ALIGN(sizeof(struct cmsghdr))))
-#define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len))
-#define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len))
+#define CMSG_DATA(cmsg)	((void *)((char *)(cmsg) + sizeof(struct cmsghdr)))
+#define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len))
+#define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len))
 
 #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \
 				  (struct cmsghdr *)(ctl) : \
diff --git a/net/compat.c b/net/compat.c
index 96c544b05b15..4e27dd1cd3a6 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -90,11 +90,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
 #define CMSG_COMPAT_ALIGN(len)	ALIGN((len), sizeof(s32))
 
 #define CMSG_COMPAT_DATA(cmsg)				\
-	((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+	((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
 #define CMSG_COMPAT_SPACE(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+	(sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
 #define CMSG_COMPAT_LEN(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+	(sizeof(struct compat_cmsghdr) + (len))
 
 #define CMSG_COMPAT_FIRSTHDR(msg)			\
 	(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ?	\
@@ -141,8 +141,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			return -EINVAL;
 
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		tmp = CMSG_ALIGN(tmp);
 		kcmlen += tmp;
 		ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +167,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 			goto Efault;
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			goto Einval;
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
 			goto Einval;
 		kcmsg->cmsg_len = tmp;
@@ -178,7 +176,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		    __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
 		    copy_from_user(CMSG_DATA(kcmsg),
 				   CMSG_COMPAT_DATA(ucmsg),
-				   (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+				   (ucmlen - sizeof(*ucmsg))))
 			goto Efault;
 
 		/* Advance. */
diff --git a/net/core/scm.c b/net/core/scm.c
index d8820438ba37..b6d83686e149 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -71,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 	struct file **fpp;
 	int i, num;
 
-	num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+	num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
 
 	if (num <= 0)
 		return 0;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf9b43de4690..1caa38d8a9c9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			continue;
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
-			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
 
 			/* Our caller is responsible for freeing ipc->opt */
 			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..0a6ef217aa8a 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -376,7 +376,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
-		len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+		len = cmsg->cmsg_len - sizeof(struct cmsghdr);
 		_debug("CMSG %d, %d, %d",
 		       cmsg->cmsg_level, cmsg->cmsg_type, len);
 
-- 
cgit v1.2.3


From ac4340fc3ce0e0b1cb627b05d6dcbd473544d7b3 Mon Sep 17 00:00:00 2001
From: David S. Miller
Date: Wed, 4 Jan 2017 13:24:19 -0500
Subject: net: Assert at build time the assumptions we make about the CMSG
 header.

It must always be the case that CMSG_ALIGN(sizeof(hdr)) == sizeof(hdr).

Otherwise there are missing adjustments in the various calculations
that parse and build these things.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/compat.c | 3 +++
 net/socket.c | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/compat.c b/net/compat.c
index 4e27dd1cd3a6..ba3ac722714d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -130,6 +130,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 	__kernel_size_t kcmlen, tmp;
 	int err = -EFAULT;
 
+	BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
+		     CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
+
 	kcmlen = 0;
 	kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
 	ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
diff --git a/net/socket.c b/net/socket.c
index 8487bf136e5c..5f3b5a2c4f37 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1948,6 +1948,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 		ctl_buf = msg_sys->msg_control;
 		ctl_len = msg_sys->msg_controllen;
 	} else if (ctl_len) {
+		BUILD_BUG_ON(sizeof(struct cmsghdr) !=
+			     CMSG_ALIGN(sizeof(struct cmsghdr)));
 		if (ctl_len > sizeof(ctl)) {
 			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
 			if (ctl_buf == NULL)
-- 
cgit v1.2.3


From a896eee3349e4e7f35a83f3b1a93c2e048d976b9 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Tue, 3 Jan 2017 14:31:49 -0500
Subject: net: dsa: remove out label in dsa_switch_setup_one

The "out" label in dsa_switch_setup_one() is useless, thus remove it.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c | 40 +++++++++++++---------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7899919cd9f0..89e66b623d73 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -329,8 +329,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			if (dst->cpu_switch != -1) {
 				netdev_err(dst->master_netdev,
 					   "multiple cpu ports?!\n");
-				ret = -EINVAL;
-				goto out;
+				return -EINVAL;
 			}
 			dst->cpu_switch = index;
 			dst->cpu_port = i;
@@ -343,10 +342,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 		valid_name_found = true;
 	}
 
-	if (!valid_name_found && i == DSA_MAX_PORTS) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!valid_name_found && i == DSA_MAX_PORTS)
+		return -EINVAL;
 
 	/* Make the built-in MII bus mask match the number of ports,
 	 * switch drivers can override this later
@@ -363,10 +360,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 
 		tag_protocol = ops->get_tag_protocol(ds);
 		dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
-		if (IS_ERR(dst->tag_ops)) {
-			ret = PTR_ERR(dst->tag_ops);
-			goto out;
-		}
+		if (IS_ERR(dst->tag_ops))
+			return PTR_ERR(dst->tag_ops);
 
 		dst->rcv = dst->tag_ops->rcv;
 	}
@@ -378,25 +373,23 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 */
 	ret = ops->setup(ds);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	if (ops->set_addr) {
 		ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
 		if (ret < 0)
-			goto out;
+			return ret;
 	}
 
 	if (!ds->slave_mii_bus && ops->phy_read) {
 		ds->slave_mii_bus = devm_mdiobus_alloc(parent);
-		if (!ds->slave_mii_bus) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (!ds->slave_mii_bus)
+			return -ENOMEM;
 		dsa_slave_mii_bus_init(ds);
 
 		ret = mdiobus_register(ds->slave_mii_bus);
 		if (ret < 0)
-			goto out;
+			return ret;
 	}
 
 	/*
@@ -409,20 +402,16 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			continue;
 
 		ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
-		if (ret < 0) {
+		if (ret < 0)
 			netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
 				   index, i, cd->port_names[i], ret);
-			ret = 0;
-		}
 	}
 
 	/* Perform configuration of the CPU and DSA ports */
 	ret = dsa_cpu_dsa_setups(ds, parent);
-	if (ret < 0) {
+	if (ret < 0)
 		netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
 			   index);
-		ret = 0;
-	}
 
 	ret = dsa_cpu_port_ethtool_setup(ds);
 	if (ret)
@@ -453,10 +442,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	}
 #endif /* CONFIG_NET_DSA_HWMON */
 
-	return ret;
-
-out:
-	return ret;
+	return 0;
 }
 
 static struct dsa_switch *
-- 
cgit v1.2.3


From 57ea884b0dcf1e59661955919976ef138ec9cdb0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 5 Jan 2017 02:34:28 +0100
Subject: packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode

When TX timestamping is in use with TPACKET_V3's TX ring, then we'll
hit the BUG() in __packet_set_timestamp() when ring buffer slot is
returned to user space via tpacket_destruct_skb(). This is due to v3
being assumed as unreachable here, but since 7f953ab2ba46 ("af_packet:
TX_RING support for TPACKET_V3") it's not anymore. Fix it by filling
the timestamp back into the ring slot.

Fixes: 7f953ab2ba46 ("af_packet: TX_RING support for TPACKET_V3")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7e39087d519b..ddbda255b6ae 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -481,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
 		h.h2->tp_nsec = ts.tv_nsec;
 		break;
 	case TPACKET_V3:
+		h.h3->tp_sec = ts.tv_sec;
+		h.h3->tp_nsec = ts.tv_nsec;
+		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
-- 
cgit v1.2.3


From b54a134a7de461f804cf0e28331d0a43ee82fb13 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 5 Jan 2017 10:38:33 +0000
Subject: rxrpc: Fix handling of enums-to-string translation in tracing

Fix the way enum values are translated into strings in AF_RXRPC
tracepoints.  The problem with just doing a lookup in a normal flat array
of strings or chars is that external tracing infrastructure can't find it.
Rather, TRACE_DEFINE_ENUM must be used.

Also sort the enums and string tables to make it easier to keep them in
order so that a future patch to __print_symbolic() can be optimised to try
a direct lookup into the table first before iterating over it.

A couple of _proto() macro calls are removed because they refered to tables
that got moved to the tracing infrastructure.  The relevant data can be
found by way of tracing.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h | 424 ++++++++++++++++++++++++++++++++++++++++---
 net/rxrpc/ar-internal.h      | 194 --------------------
 net/rxrpc/call_object.c      |  18 --
 net/rxrpc/conn_client.c      |   8 -
 net/rxrpc/input.c            |  10 -
 net/rxrpc/misc.c             | 151 ---------------
 6 files changed, 403 insertions(+), 402 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 0383e5e9a0f3..2395a57462c9 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -16,6 +16,386 @@
 
 #include <linux/tracepoint.h>
 
+/*
+ * Define enums for tracing information.
+ *
+ * These should all be kept sorted, making it easier to match the string
+ * mapping tables further on.
+ */
+#ifndef __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY
+#define __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY
+
+enum rxrpc_skb_trace {
+	rxrpc_skb_rx_cleaned,
+	rxrpc_skb_rx_freed,
+	rxrpc_skb_rx_got,
+	rxrpc_skb_rx_lost,
+	rxrpc_skb_rx_purged,
+	rxrpc_skb_rx_received,
+	rxrpc_skb_rx_rotated,
+	rxrpc_skb_rx_seen,
+	rxrpc_skb_tx_cleaned,
+	rxrpc_skb_tx_freed,
+	rxrpc_skb_tx_got,
+	rxrpc_skb_tx_new,
+	rxrpc_skb_tx_rotated,
+	rxrpc_skb_tx_seen,
+};
+
+enum rxrpc_conn_trace {
+	rxrpc_conn_got,
+	rxrpc_conn_new_client,
+	rxrpc_conn_new_service,
+	rxrpc_conn_put_client,
+	rxrpc_conn_put_service,
+	rxrpc_conn_queued,
+	rxrpc_conn_seen,
+};
+
+enum rxrpc_client_trace {
+	rxrpc_client_activate_chans,
+	rxrpc_client_alloc,
+	rxrpc_client_chan_activate,
+	rxrpc_client_chan_disconnect,
+	rxrpc_client_chan_pass,
+	rxrpc_client_chan_unstarted,
+	rxrpc_client_cleanup,
+	rxrpc_client_count,
+	rxrpc_client_discard,
+	rxrpc_client_duplicate,
+	rxrpc_client_exposed,
+	rxrpc_client_replace,
+	rxrpc_client_to_active,
+	rxrpc_client_to_culled,
+	rxrpc_client_to_idle,
+	rxrpc_client_to_inactive,
+	rxrpc_client_to_upgrade,
+	rxrpc_client_to_waiting,
+	rxrpc_client_uncount,
+};
+
+enum rxrpc_call_trace {
+	rxrpc_call_connected,
+	rxrpc_call_error,
+	rxrpc_call_got,
+	rxrpc_call_got_kernel,
+	rxrpc_call_got_userid,
+	rxrpc_call_new_client,
+	rxrpc_call_new_service,
+	rxrpc_call_put,
+	rxrpc_call_put_kernel,
+	rxrpc_call_put_noqueue,
+	rxrpc_call_put_userid,
+	rxrpc_call_queued,
+	rxrpc_call_queued_ref,
+	rxrpc_call_release,
+	rxrpc_call_seen,
+};
+
+enum rxrpc_transmit_trace {
+	rxrpc_transmit_await_reply,
+	rxrpc_transmit_end,
+	rxrpc_transmit_queue,
+	rxrpc_transmit_queue_last,
+	rxrpc_transmit_rotate,
+	rxrpc_transmit_rotate_last,
+	rxrpc_transmit_wait,
+};
+
+enum rxrpc_receive_trace {
+	rxrpc_receive_end,
+	rxrpc_receive_front,
+	rxrpc_receive_incoming,
+	rxrpc_receive_queue,
+	rxrpc_receive_queue_last,
+	rxrpc_receive_rotate,
+};
+
+enum rxrpc_recvmsg_trace {
+	rxrpc_recvmsg_cont,
+	rxrpc_recvmsg_data_return,
+	rxrpc_recvmsg_dequeue,
+	rxrpc_recvmsg_enter,
+	rxrpc_recvmsg_full,
+	rxrpc_recvmsg_hole,
+	rxrpc_recvmsg_next,
+	rxrpc_recvmsg_return,
+	rxrpc_recvmsg_terminal,
+	rxrpc_recvmsg_to_be_accepted,
+	rxrpc_recvmsg_wait,
+};
+
+enum rxrpc_rtt_tx_trace {
+	rxrpc_rtt_tx_data,
+	rxrpc_rtt_tx_ping,
+};
+
+enum rxrpc_rtt_rx_trace {
+	rxrpc_rtt_rx_ping_response,
+	rxrpc_rtt_rx_requested_ack,
+};
+
+enum rxrpc_timer_trace {
+	rxrpc_timer_begin,
+	rxrpc_timer_expired,
+	rxrpc_timer_init_for_reply,
+	rxrpc_timer_init_for_send_reply,
+	rxrpc_timer_set_for_ack,
+	rxrpc_timer_set_for_ping,
+	rxrpc_timer_set_for_resend,
+	rxrpc_timer_set_for_send,
+};
+
+enum rxrpc_propose_ack_trace {
+	rxrpc_propose_ack_client_tx_end,
+	rxrpc_propose_ack_input_data,
+	rxrpc_propose_ack_ping_for_lost_ack,
+	rxrpc_propose_ack_ping_for_lost_reply,
+	rxrpc_propose_ack_ping_for_params,
+	rxrpc_propose_ack_processing_op,
+	rxrpc_propose_ack_respond_to_ack,
+	rxrpc_propose_ack_respond_to_ping,
+	rxrpc_propose_ack_retry_tx,
+	rxrpc_propose_ack_rotate_rx,
+	rxrpc_propose_ack_terminal_ack,
+};
+
+enum rxrpc_propose_ack_outcome {
+	rxrpc_propose_ack_subsume,
+	rxrpc_propose_ack_update,
+	rxrpc_propose_ack_use,
+};
+
+enum rxrpc_congest_change {
+	rxrpc_cong_begin_retransmission,
+	rxrpc_cong_cleared_nacks,
+	rxrpc_cong_new_low_nack,
+	rxrpc_cong_no_change,
+	rxrpc_cong_progress,
+	rxrpc_cong_retransmit_again,
+	rxrpc_cong_rtt_window_end,
+	rxrpc_cong_saw_nack,
+};
+
+#endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */
+
+/*
+ * Declare tracing information enums and their string mappings for display.
+ */
+#define rxrpc_skb_traces \
+	EM(rxrpc_skb_rx_cleaned,		"Rx CLN") \
+	EM(rxrpc_skb_rx_freed,			"Rx FRE") \
+	EM(rxrpc_skb_rx_got,			"Rx GOT") \
+	EM(rxrpc_skb_rx_lost,			"Rx *L*") \
+	EM(rxrpc_skb_rx_purged,			"Rx PUR") \
+	EM(rxrpc_skb_rx_received,		"Rx RCV") \
+	EM(rxrpc_skb_rx_rotated,		"Rx ROT") \
+	EM(rxrpc_skb_rx_seen,			"Rx SEE") \
+	EM(rxrpc_skb_tx_cleaned,		"Tx CLN") \
+	EM(rxrpc_skb_tx_freed,			"Tx FRE") \
+	EM(rxrpc_skb_tx_got,			"Tx GOT") \
+	EM(rxrpc_skb_tx_new,			"Tx NEW") \
+	EM(rxrpc_skb_tx_rotated,		"Tx ROT") \
+	E_(rxrpc_skb_tx_seen,			"Tx SEE")
+
+#define rxrpc_conn_traces \
+	EM(rxrpc_conn_got,			"GOT") \
+	EM(rxrpc_conn_new_client,		"NWc") \
+	EM(rxrpc_conn_new_service,		"NWs") \
+	EM(rxrpc_conn_put_client,		"PTc") \
+	EM(rxrpc_conn_put_service,		"PTs") \
+	EM(rxrpc_conn_queued,			"QUE") \
+	E_(rxrpc_conn_seen,			"SEE")
+
+#define rxrpc_client_traces \
+	EM(rxrpc_client_activate_chans,		"Activa") \
+	EM(rxrpc_client_alloc,			"Alloc ") \
+	EM(rxrpc_client_chan_activate,		"ChActv") \
+	EM(rxrpc_client_chan_disconnect,	"ChDisc") \
+	EM(rxrpc_client_chan_pass,		"ChPass") \
+	EM(rxrpc_client_chan_unstarted,		"ChUnst") \
+	EM(rxrpc_client_cleanup,		"Clean ") \
+	EM(rxrpc_client_count,			"Count ") \
+	EM(rxrpc_client_discard,		"Discar") \
+	EM(rxrpc_client_duplicate,		"Duplic") \
+	EM(rxrpc_client_exposed,		"Expose") \
+	EM(rxrpc_client_replace,		"Replac") \
+	EM(rxrpc_client_to_active,		"->Actv") \
+	EM(rxrpc_client_to_culled,		"->Cull") \
+	EM(rxrpc_client_to_idle,		"->Idle") \
+	EM(rxrpc_client_to_inactive,		"->Inac") \
+	EM(rxrpc_client_to_upgrade,		"->Upgd") \
+	EM(rxrpc_client_to_waiting,		"->Wait") \
+	E_(rxrpc_client_uncount,		"Uncoun")
+
+#define rxrpc_conn_cache_states \
+	EM(RXRPC_CONN_CLIENT_INACTIVE,		"Inac") \
+	EM(RXRPC_CONN_CLIENT_WAITING,		"Wait") \
+	EM(RXRPC_CONN_CLIENT_ACTIVE,		"Actv") \
+	EM(RXRPC_CONN_CLIENT_CULLED,		"Cull") \
+	E_(RXRPC_CONN_CLIENT_IDLE,		"Idle") \
+
+#define rxrpc_call_traces \
+	EM(rxrpc_call_connected,		"CON") \
+	EM(rxrpc_call_error,			"*E*") \
+	EM(rxrpc_call_got,			"GOT") \
+	EM(rxrpc_call_got_kernel,		"Gke") \
+	EM(rxrpc_call_got_userid,		"Gus") \
+	EM(rxrpc_call_new_client,		"NWc") \
+	EM(rxrpc_call_new_service,		"NWs") \
+	EM(rxrpc_call_put,			"PUT") \
+	EM(rxrpc_call_put_kernel,		"Pke") \
+	EM(rxrpc_call_put_noqueue,		"PNQ") \
+	EM(rxrpc_call_put_userid,		"Pus") \
+	EM(rxrpc_call_queued,			"QUE") \
+	EM(rxrpc_call_queued_ref,		"QUR") \
+	EM(rxrpc_call_release,			"RLS") \
+	E_(rxrpc_call_seen,			"SEE")
+
+#define rxrpc_transmit_traces \
+	EM(rxrpc_transmit_await_reply,		"AWR") \
+	EM(rxrpc_transmit_end,			"END") \
+	EM(rxrpc_transmit_queue,		"QUE") \
+	EM(rxrpc_transmit_queue_last,		"QLS") \
+	EM(rxrpc_transmit_rotate,		"ROT") \
+	EM(rxrpc_transmit_rotate_last,		"RLS") \
+	E_(rxrpc_transmit_wait,			"WAI")
+
+#define rxrpc_receive_traces \
+	EM(rxrpc_receive_end,			"END") \
+	EM(rxrpc_receive_front,			"FRN") \
+	EM(rxrpc_receive_incoming,		"INC") \
+	EM(rxrpc_receive_queue,			"QUE") \
+	EM(rxrpc_receive_queue_last,		"QLS") \
+	E_(rxrpc_receive_rotate,		"ROT")
+
+#define rxrpc_recvmsg_traces \
+	EM(rxrpc_recvmsg_cont,			"CONT") \
+	EM(rxrpc_recvmsg_data_return,		"DATA") \
+	EM(rxrpc_recvmsg_dequeue,		"DEQU") \
+	EM(rxrpc_recvmsg_enter,			"ENTR") \
+	EM(rxrpc_recvmsg_full,			"FULL") \
+	EM(rxrpc_recvmsg_hole,			"HOLE") \
+	EM(rxrpc_recvmsg_next,			"NEXT") \
+	EM(rxrpc_recvmsg_return,		"RETN") \
+	EM(rxrpc_recvmsg_terminal,		"TERM") \
+	EM(rxrpc_recvmsg_to_be_accepted,	"TBAC") \
+	E_(rxrpc_recvmsg_wait,			"WAIT")
+
+#define rxrpc_rtt_tx_traces \
+	EM(rxrpc_rtt_tx_data,			"DATA") \
+	E_(rxrpc_rtt_tx_ping,			"PING")
+
+#define rxrpc_rtt_rx_traces \
+	EM(rxrpc_rtt_rx_ping_response,		"PONG") \
+	E_(rxrpc_rtt_rx_requested_ack,		"RACK")
+
+#define rxrpc_timer_traces \
+	EM(rxrpc_timer_begin,			"Begin ") \
+	EM(rxrpc_timer_expired,			"*EXPR*") \
+	EM(rxrpc_timer_init_for_reply,		"IniRpl") \
+	EM(rxrpc_timer_init_for_send_reply,	"SndRpl") \
+	EM(rxrpc_timer_set_for_ack,		"SetAck") \
+	EM(rxrpc_timer_set_for_ping,		"SetPng") \
+	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
+	E_(rxrpc_timer_set_for_send,		"SetTx ")
+
+#define rxrpc_propose_ack_traces \
+	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
+	EM(rxrpc_propose_ack_input_data,	"DataIn ") \
+	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
+	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
+	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
+	EM(rxrpc_propose_ack_processing_op,	"ProcOp ") \
+	EM(rxrpc_propose_ack_respond_to_ack,	"Rsp2Ack") \
+	EM(rxrpc_propose_ack_respond_to_ping,	"Rsp2Png") \
+	EM(rxrpc_propose_ack_retry_tx,		"RetryTx") \
+	EM(rxrpc_propose_ack_rotate_rx,		"RxAck  ") \
+	E_(rxrpc_propose_ack_terminal_ack,	"ClTerm ")
+
+#define rxrpc_propose_ack_outcomes \
+	EM(rxrpc_propose_ack_subsume,		" Subsume") \
+	EM(rxrpc_propose_ack_update,		" Update") \
+	E_(rxrpc_propose_ack_use,		"")
+
+#define rxrpc_congest_modes \
+	EM(RXRPC_CALL_CONGEST_AVOIDANCE,	"CongAvoid") \
+	EM(RXRPC_CALL_FAST_RETRANSMIT,		"FastReTx ") \
+	EM(RXRPC_CALL_PACKET_LOSS,		"PktLoss  ") \
+	E_(RXRPC_CALL_SLOW_START,		"SlowStart")
+
+#define rxrpc_congest_changes \
+	EM(rxrpc_cong_begin_retransmission,	" Retrans") \
+	EM(rxrpc_cong_cleared_nacks,		" Cleared") \
+	EM(rxrpc_cong_new_low_nack,		" NewLowN") \
+	EM(rxrpc_cong_no_change,		"") \
+	EM(rxrpc_cong_progress,			" Progres") \
+	EM(rxrpc_cong_retransmit_again,		" ReTxAgn") \
+	EM(rxrpc_cong_rtt_window_end,		" RttWinE") \
+	E_(rxrpc_cong_saw_nack,			" SawNack")
+
+#define rxrpc_pkts \
+	EM(0,					"?00") \
+	EM(RXRPC_PACKET_TYPE_DATA,		"DATA") \
+	EM(RXRPC_PACKET_TYPE_ACK,		"ACK") \
+	EM(RXRPC_PACKET_TYPE_BUSY,		"BUSY") \
+	EM(RXRPC_PACKET_TYPE_ABORT,		"ABORT") \
+	EM(RXRPC_PACKET_TYPE_ACKALL,		"ACKALL") \
+	EM(RXRPC_PACKET_TYPE_CHALLENGE,		"CHALL") \
+	EM(RXRPC_PACKET_TYPE_RESPONSE,		"RESP") \
+	EM(RXRPC_PACKET_TYPE_DEBUG,		"DEBUG") \
+	EM(9,					"?09") \
+	EM(10,					"?10") \
+	EM(11,					"?11") \
+	EM(12,					"?12") \
+	EM(RXRPC_PACKET_TYPE_VERSION,		"VERSION") \
+	EM(14,					"?14") \
+	E_(15,					"?15")
+
+#define rxrpc_ack_names \
+	EM(0,					"-0-") \
+	EM(RXRPC_ACK_REQUESTED,			"REQ") \
+	EM(RXRPC_ACK_DUPLICATE,			"DUP") \
+	EM(RXRPC_ACK_OUT_OF_SEQUENCE,		"OOS") \
+	EM(RXRPC_ACK_EXCEEDS_WINDOW,		"WIN") \
+	EM(RXRPC_ACK_NOSPACE,			"MEM") \
+	EM(RXRPC_ACK_PING,			"PNG") \
+	EM(RXRPC_ACK_PING_RESPONSE,		"PNR") \
+	EM(RXRPC_ACK_DELAY,			"DLY") \
+	EM(RXRPC_ACK_IDLE,			"IDL") \
+	E_(RXRPC_ACK__INVALID,			"-?-")
+
+/*
+ * Export enum symbols via userspace.
+ */
+#undef EM
+#undef E_
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define E_(a, b) TRACE_DEFINE_ENUM(a);
+
+rxrpc_skb_traces;
+rxrpc_conn_traces;
+rxrpc_client_traces;
+rxrpc_call_traces;
+rxrpc_transmit_traces;
+rxrpc_receive_traces;
+rxrpc_recvmsg_traces;
+rxrpc_rtt_tx_traces;
+rxrpc_rtt_rx_traces;
+rxrpc_timer_traces;
+rxrpc_propose_ack_traces;
+rxrpc_propose_ack_outcomes;
+rxrpc_congest_changes;
+
+/*
+ * Now redefine the EM() and E_() macros to map the enums to the strings that
+ * will be printed in the output.
+ */
+#undef EM
+#undef E_
+#define EM(a, b)	{ a, b },
+#define E_(a, b)	{ a, b }
+
 TRACE_EVENT(rxrpc_conn,
 	    TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op,
 		     int usage, const void *where),
@@ -38,7 +418,7 @@ TRACE_EVENT(rxrpc_conn,
 
 	    TP_printk("C=%p %s u=%d sp=%pSR",
 		      __entry->conn,
-		      rxrpc_conn_traces[__entry->op],
+		      __print_symbolic(__entry->op, rxrpc_conn_traces),
 		      __entry->usage,
 		      __entry->where)
 	    );
@@ -70,8 +450,8 @@ TRACE_EVENT(rxrpc_client,
 	    TP_printk("C=%p h=%2d %s %s i=%08x u=%d",
 		      __entry->conn,
 		      __entry->channel,
-		      rxrpc_client_traces[__entry->op],
-		      rxrpc_conn_cache_states[__entry->cs],
+		      __print_symbolic(__entry->op, rxrpc_client_traces),
+		      __print_symbolic(__entry->cs, rxrpc_conn_cache_states),
 		      __entry->cid,
 		      __entry->usage)
 	    );
@@ -100,7 +480,7 @@ TRACE_EVENT(rxrpc_call,
 
 	    TP_printk("c=%p %s u=%d sp=%pSR a=%p",
 		      __entry->call,
-		      rxrpc_call_traces[__entry->op],
+		      __print_symbolic(__entry->op, rxrpc_call_traces),
 		      __entry->usage,
 		      __entry->where,
 		      __entry->aux)
@@ -130,7 +510,7 @@ TRACE_EVENT(rxrpc_skb,
 
 	    TP_printk("s=%p %s u=%d m=%d p=%pSR",
 		      __entry->skb,
-		      rxrpc_skb_traces[__entry->op],
+		      __print_symbolic(__entry->op, rxrpc_skb_traces),
 		      __entry->usage,
 		      __entry->mod_count,
 		      __entry->where)
@@ -154,7 +534,8 @@ TRACE_EVENT(rxrpc_rx_packet,
 		      __entry->hdr.callNumber, __entry->hdr.serviceId,
 		      __entry->hdr.serial, __entry->hdr.seq,
 		      __entry->hdr.type, __entry->hdr.flags,
-		      __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK")
+		      __entry->hdr.type <= 15 ?
+		      __print_symbolic(__entry->hdr.type, rxrpc_pkts) : "?UNK")
 	    );
 
 TRACE_EVENT(rxrpc_rx_done,
@@ -225,7 +606,7 @@ TRACE_EVENT(rxrpc_transmit,
 
 	    TP_printk("c=%p %s f=%08x n=%u",
 		      __entry->call,
-		      rxrpc_transmit_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_transmit_traces),
 		      __entry->tx_hard_ack + 1,
 		      __entry->tx_top - __entry->tx_hard_ack)
 	    );
@@ -251,7 +632,7 @@ TRACE_EVENT(rxrpc_rx_ack,
 
 	    TP_printk("c=%p %s f=%08x n=%u",
 		      __entry->call,
-		      rxrpc_ack_names[__entry->reason],
+		      __print_symbolic(__entry->reason, rxrpc_ack_names),
 		      __entry->first,
 		      __entry->n_acks)
 	    );
@@ -317,7 +698,7 @@ TRACE_EVENT(rxrpc_tx_ack,
 	    TP_printk(" c=%p ACK  %08x %s f=%08x r=%08x n=%u",
 		      __entry->call,
 		      __entry->serial,
-		      rxrpc_ack_names[__entry->reason],
+		      __print_symbolic(__entry->reason, rxrpc_ack_names),
 		      __entry->ack_first,
 		      __entry->ack_serial,
 		      __entry->n_acks)
@@ -349,7 +730,7 @@ TRACE_EVENT(rxrpc_receive,
 
 	    TP_printk("c=%p %s r=%08x q=%08x w=%08x-%08x",
 		      __entry->call,
-		      rxrpc_receive_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_receive_traces),
 		      __entry->serial,
 		      __entry->seq,
 		      __entry->hard_ack,
@@ -383,7 +764,7 @@ TRACE_EVENT(rxrpc_recvmsg,
 
 	    TP_printk("c=%p %s q=%08x o=%u l=%u ret=%d",
 		      __entry->call,
-		      rxrpc_recvmsg_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_recvmsg_traces),
 		      __entry->seq,
 		      __entry->offset,
 		      __entry->len,
@@ -410,7 +791,7 @@ TRACE_EVENT(rxrpc_rtt_tx,
 
 	    TP_printk("c=%p %s sr=%08x",
 		      __entry->call,
-		      rxrpc_rtt_tx_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_rtt_tx_traces),
 		      __entry->send_serial)
 	    );
 
@@ -443,7 +824,7 @@ TRACE_EVENT(rxrpc_rtt_rx,
 
 	    TP_printk("c=%p %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld",
 		      __entry->call,
-		      rxrpc_rtt_rx_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_rtt_rx_traces),
 		      __entry->send_serial,
 		      __entry->resp_serial,
 		      __entry->rtt,
@@ -481,7 +862,7 @@ TRACE_EVENT(rxrpc_timer,
 
 	    TP_printk("c=%p %s x=%lld a=%lld r=%lld t=%ld",
 		      __entry->call,
-		      rxrpc_timer_traces[__entry->why],
+		      __print_symbolic(__entry->why, rxrpc_timer_traces),
 		      ktime_to_ns(ktime_sub(__entry->expire_at, __entry->now)),
 		      ktime_to_ns(ktime_sub(__entry->ack_at, __entry->now)),
 		      ktime_to_ns(ktime_sub(__entry->resend_at, __entry->now)),
@@ -506,7 +887,8 @@ TRACE_EVENT(rxrpc_rx_lose,
 		      __entry->hdr.callNumber, __entry->hdr.serviceId,
 		      __entry->hdr.serial, __entry->hdr.seq,
 		      __entry->hdr.type, __entry->hdr.flags,
-		      __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK")
+		      __entry->hdr.type <= 15 ?
+		      __print_symbolic(__entry->hdr.type, rxrpc_pkts) : "?UNK")
 	    );
 
 TRACE_EVENT(rxrpc_propose_ack,
@@ -539,12 +921,12 @@ TRACE_EVENT(rxrpc_propose_ack,
 
 	    TP_printk("c=%p %s %s r=%08x i=%u b=%u%s",
 		      __entry->call,
-		      rxrpc_propose_ack_traces[__entry->why],
-		      rxrpc_ack_names[__entry->ack_reason],
+		      __print_symbolic(__entry->why, rxrpc_propose_ack_traces),
+		      __print_symbolic(__entry->ack_reason, rxrpc_ack_names),
 		      __entry->serial,
 		      __entry->immediate,
 		      __entry->background,
-		      rxrpc_propose_ack_outcomes[__entry->outcome])
+		      __print_symbolic(__entry->outcome, rxrpc_propose_ack_outcomes))
 	    );
 
 TRACE_EVENT(rxrpc_retransmit,
@@ -603,9 +985,9 @@ TRACE_EVENT(rxrpc_congest,
 	    TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s",
 		      __entry->call,
 		      __entry->ack_serial,
-		      rxrpc_ack_names[__entry->sum.ack_reason],
+		      __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names),
 		      __entry->hard_ack,
-		      rxrpc_congest_modes[__entry->sum.mode],
+		      __print_symbolic(__entry->sum.mode, rxrpc_congest_modes),
 		      __entry->sum.cwnd,
 		      __entry->sum.ssthresh,
 		      __entry->sum.nr_acks, __entry->sum.nr_nacks,
@@ -615,7 +997,7 @@ TRACE_EVENT(rxrpc_congest,
 		      __entry->sum.cumulative_acks,
 		      __entry->sum.dup_acks,
 		      __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "",
-		      rxrpc_congest_changes[__entry->change],
+		      __print_symbolic(__entry->change, rxrpc_congest_changes),
 		      __entry->sum.retrans_timeo ? " rTxTo" : "")
 	    );
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f60e35576526..84927c7b5fdf 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -593,200 +593,6 @@ struct rxrpc_ack_summary {
 	u8			cumulative_acks;
 };
 
-enum rxrpc_skb_trace {
-	rxrpc_skb_rx_cleaned,
-	rxrpc_skb_rx_freed,
-	rxrpc_skb_rx_got,
-	rxrpc_skb_rx_lost,
-	rxrpc_skb_rx_received,
-	rxrpc_skb_rx_rotated,
-	rxrpc_skb_rx_purged,
-	rxrpc_skb_rx_seen,
-	rxrpc_skb_tx_cleaned,
-	rxrpc_skb_tx_freed,
-	rxrpc_skb_tx_got,
-	rxrpc_skb_tx_new,
-	rxrpc_skb_tx_rotated,
-	rxrpc_skb_tx_seen,
-	rxrpc_skb__nr_trace
-};
-
-extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
-
-enum rxrpc_conn_trace {
-	rxrpc_conn_new_client,
-	rxrpc_conn_new_service,
-	rxrpc_conn_queued,
-	rxrpc_conn_seen,
-	rxrpc_conn_got,
-	rxrpc_conn_put_client,
-	rxrpc_conn_put_service,
-	rxrpc_conn__nr_trace
-};
-
-extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
-
-enum rxrpc_client_trace {
-	rxrpc_client_activate_chans,
-	rxrpc_client_alloc,
-	rxrpc_client_chan_activate,
-	rxrpc_client_chan_disconnect,
-	rxrpc_client_chan_pass,
-	rxrpc_client_chan_unstarted,
-	rxrpc_client_cleanup,
-	rxrpc_client_count,
-	rxrpc_client_discard,
-	rxrpc_client_duplicate,
-	rxrpc_client_exposed,
-	rxrpc_client_replace,
-	rxrpc_client_to_active,
-	rxrpc_client_to_culled,
-	rxrpc_client_to_idle,
-	rxrpc_client_to_inactive,
-	rxrpc_client_to_waiting,
-	rxrpc_client_uncount,
-	rxrpc_client__nr_trace
-};
-
-extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
-extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
-
-enum rxrpc_call_trace {
-	rxrpc_call_new_client,
-	rxrpc_call_new_service,
-	rxrpc_call_queued,
-	rxrpc_call_queued_ref,
-	rxrpc_call_seen,
-	rxrpc_call_connected,
-	rxrpc_call_release,
-	rxrpc_call_got,
-	rxrpc_call_got_userid,
-	rxrpc_call_got_kernel,
-	rxrpc_call_put,
-	rxrpc_call_put_userid,
-	rxrpc_call_put_kernel,
-	rxrpc_call_put_noqueue,
-	rxrpc_call_error,
-	rxrpc_call__nr_trace
-};
-
-extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
-
-enum rxrpc_transmit_trace {
-	rxrpc_transmit_wait,
-	rxrpc_transmit_queue,
-	rxrpc_transmit_queue_last,
-	rxrpc_transmit_rotate,
-	rxrpc_transmit_rotate_last,
-	rxrpc_transmit_await_reply,
-	rxrpc_transmit_end,
-	rxrpc_transmit__nr_trace
-};
-
-extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
-
-enum rxrpc_receive_trace {
-	rxrpc_receive_incoming,
-	rxrpc_receive_queue,
-	rxrpc_receive_queue_last,
-	rxrpc_receive_front,
-	rxrpc_receive_rotate,
-	rxrpc_receive_end,
-	rxrpc_receive__nr_trace
-};
-
-extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
-
-enum rxrpc_recvmsg_trace {
-	rxrpc_recvmsg_enter,
-	rxrpc_recvmsg_wait,
-	rxrpc_recvmsg_dequeue,
-	rxrpc_recvmsg_hole,
-	rxrpc_recvmsg_next,
-	rxrpc_recvmsg_cont,
-	rxrpc_recvmsg_full,
-	rxrpc_recvmsg_data_return,
-	rxrpc_recvmsg_terminal,
-	rxrpc_recvmsg_to_be_accepted,
-	rxrpc_recvmsg_return,
-	rxrpc_recvmsg__nr_trace
-};
-
-extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
-
-enum rxrpc_rtt_tx_trace {
-	rxrpc_rtt_tx_ping,
-	rxrpc_rtt_tx_data,
-	rxrpc_rtt_tx__nr_trace
-};
-
-extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
-
-enum rxrpc_rtt_rx_trace {
-	rxrpc_rtt_rx_ping_response,
-	rxrpc_rtt_rx_requested_ack,
-	rxrpc_rtt_rx__nr_trace
-};
-
-extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
-
-enum rxrpc_timer_trace {
-	rxrpc_timer_begin,
-	rxrpc_timer_init_for_reply,
-	rxrpc_timer_init_for_send_reply,
-	rxrpc_timer_expired,
-	rxrpc_timer_set_for_ack,
-	rxrpc_timer_set_for_ping,
-	rxrpc_timer_set_for_resend,
-	rxrpc_timer_set_for_send,
-	rxrpc_timer__nr_trace
-};
-
-extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
-
-enum rxrpc_propose_ack_trace {
-	rxrpc_propose_ack_client_tx_end,
-	rxrpc_propose_ack_input_data,
-	rxrpc_propose_ack_ping_for_lost_ack,
-	rxrpc_propose_ack_ping_for_lost_reply,
-	rxrpc_propose_ack_ping_for_params,
-	rxrpc_propose_ack_processing_op,
-	rxrpc_propose_ack_respond_to_ack,
-	rxrpc_propose_ack_respond_to_ping,
-	rxrpc_propose_ack_retry_tx,
-	rxrpc_propose_ack_rotate_rx,
-	rxrpc_propose_ack_terminal_ack,
-	rxrpc_propose_ack__nr_trace
-};
-
-enum rxrpc_propose_ack_outcome {
-	rxrpc_propose_ack_use,
-	rxrpc_propose_ack_update,
-	rxrpc_propose_ack_subsume,
-	rxrpc_propose_ack__nr_outcomes
-};
-
-extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
-extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
-
-enum rxrpc_congest_change {
-	rxrpc_cong_begin_retransmission,
-	rxrpc_cong_cleared_nacks,
-	rxrpc_cong_new_low_nack,
-	rxrpc_cong_no_change,
-	rxrpc_cong_progress,
-	rxrpc_cong_retransmit_again,
-	rxrpc_cong_rtt_window_end,
-	rxrpc_cong_saw_nack,
-	rxrpc_congest__nr_change
-};
-
-extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
-extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
-
-extern const char *const rxrpc_pkts[];
-extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
-
 #include <trace/events/rxrpc.h>
 
 /*
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1ed18d8c9c9f..8b94db3c9b2e 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
 	[RXRPC_CALL_NETWORK_ERROR]		= "NetError",
 };
 
-const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
-	[rxrpc_call_new_client]		= "NWc",
-	[rxrpc_call_new_service]	= "NWs",
-	[rxrpc_call_queued]		= "QUE",
-	[rxrpc_call_queued_ref]		= "QUR",
-	[rxrpc_call_connected]		= "CON",
-	[rxrpc_call_release]		= "RLS",
-	[rxrpc_call_seen]		= "SEE",
-	[rxrpc_call_got]		= "GOT",
-	[rxrpc_call_got_userid]		= "Gus",
-	[rxrpc_call_got_kernel]		= "Gke",
-	[rxrpc_call_put]		= "PUT",
-	[rxrpc_call_put_userid]		= "Pus",
-	[rxrpc_call_put_kernel]		= "Pke",
-	[rxrpc_call_put_noqueue]	= "PNQ",
-	[rxrpc_call_error]		= "*E*",
-};
-
 struct kmem_cache *rxrpc_call_jar;
 LIST_HEAD(rxrpc_calls);
 DEFINE_RWLOCK(rxrpc_call_lock);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 6cbcdcc29853..40a1ef2adeb4 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -105,14 +105,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
 static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
 			    rxrpc_discard_expired_client_conns);
 
-const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
-	[RXRPC_CONN_CLIENT_INACTIVE]	= "Inac",
-	[RXRPC_CONN_CLIENT_WAITING]	= "Wait",
-	[RXRPC_CONN_CLIENT_ACTIVE]	= "Actv",
-	[RXRPC_CONN_CLIENT_CULLED]	= "Cull",
-	[RXRPC_CONN_CLIENT_IDLE]	= "Idle",
-};
-
 /*
  * Get a connection ID and epoch for a client connection from the global pool.
  * The connection struct pointer is then recorded in the idr radix tree.  The
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1d87b5453ef7..7c2abd85def9 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -767,15 +767,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 
 	trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
 
-	_proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
-	       sp->hdr.serial,
-	       ntohs(buf.ack.maxSkew),
-	       first_soft_ack,
-	       ntohl(buf.ack.previousPacket),
-	       acked_serial,
-	       rxrpc_ack_names[summary.ack_reason],
-	       buf.ack.nAcks);
-
 	if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
 		rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
 					  sp->hdr.serial);
@@ -931,7 +922,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 		break;
 
 	default:
-		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
 		break;
 	}
 
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 6dee55fad2d3..1a2d4b112064 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4;
  */
 unsigned int rxrpc_resend_timeout = 4 * 1000;
 
-const char *const rxrpc_pkts[] = {
-	"?00",
-	"DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
-	"?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
-};
-
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
 	[RXRPC_ACK_DELAY]		= 1,
@@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = {
 	[RXRPC_ACK_NOSPACE]		= 7,
 	[RXRPC_ACK_PING_RESPONSE]	= 8,
 };
-
-const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
-	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
-	"IDL", "-?-"
-};
-
-const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
-	[rxrpc_skb_rx_cleaned]		= "Rx CLN",
-	[rxrpc_skb_rx_freed]		= "Rx FRE",
-	[rxrpc_skb_rx_got]		= "Rx GOT",
-	[rxrpc_skb_rx_lost]		= "Rx *L*",
-	[rxrpc_skb_rx_received]		= "Rx RCV",
-	[rxrpc_skb_rx_purged]		= "Rx PUR",
-	[rxrpc_skb_rx_rotated]		= "Rx ROT",
-	[rxrpc_skb_rx_seen]		= "Rx SEE",
-	[rxrpc_skb_tx_cleaned]		= "Tx CLN",
-	[rxrpc_skb_tx_freed]		= "Tx FRE",
-	[rxrpc_skb_tx_got]		= "Tx GOT",
-	[rxrpc_skb_tx_new]		= "Tx NEW",
-	[rxrpc_skb_tx_rotated]		= "Tx ROT",
-	[rxrpc_skb_tx_seen]		= "Tx SEE",
-};
-
-const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
-	[rxrpc_conn_new_client]		= "NWc",
-	[rxrpc_conn_new_service]	= "NWs",
-	[rxrpc_conn_queued]		= "QUE",
-	[rxrpc_conn_seen]		= "SEE",
-	[rxrpc_conn_got]		= "GOT",
-	[rxrpc_conn_put_client]		= "PTc",
-	[rxrpc_conn_put_service]	= "PTs",
-};
-
-const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
-	[rxrpc_client_activate_chans]	= "Activa",
-	[rxrpc_client_alloc]		= "Alloc ",
-	[rxrpc_client_chan_activate]	= "ChActv",
-	[rxrpc_client_chan_disconnect]	= "ChDisc",
-	[rxrpc_client_chan_pass]	= "ChPass",
-	[rxrpc_client_chan_unstarted]	= "ChUnst",
-	[rxrpc_client_cleanup]		= "Clean ",
-	[rxrpc_client_count]		= "Count ",
-	[rxrpc_client_discard]		= "Discar",
-	[rxrpc_client_duplicate]	= "Duplic",
-	[rxrpc_client_exposed]		= "Expose",
-	[rxrpc_client_replace]		= "Replac",
-	[rxrpc_client_to_active]	= "->Actv",
-	[rxrpc_client_to_culled]	= "->Cull",
-	[rxrpc_client_to_idle]		= "->Idle",
-	[rxrpc_client_to_inactive]	= "->Inac",
-	[rxrpc_client_to_waiting]	= "->Wait",
-	[rxrpc_client_uncount]		= "Uncoun",
-};
-
-const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
-	[rxrpc_transmit_wait]		= "WAI",
-	[rxrpc_transmit_queue]		= "QUE",
-	[rxrpc_transmit_queue_last]	= "QLS",
-	[rxrpc_transmit_rotate]		= "ROT",
-	[rxrpc_transmit_rotate_last]	= "RLS",
-	[rxrpc_transmit_await_reply]	= "AWR",
-	[rxrpc_transmit_end]		= "END",
-};
-
-const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
-	[rxrpc_receive_incoming]	= "INC",
-	[rxrpc_receive_queue]		= "QUE",
-	[rxrpc_receive_queue_last]	= "QLS",
-	[rxrpc_receive_front]		= "FRN",
-	[rxrpc_receive_rotate]		= "ROT",
-	[rxrpc_receive_end]		= "END",
-};
-
-const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
-	[rxrpc_recvmsg_enter]		= "ENTR",
-	[rxrpc_recvmsg_wait]		= "WAIT",
-	[rxrpc_recvmsg_dequeue]		= "DEQU",
-	[rxrpc_recvmsg_hole]		= "HOLE",
-	[rxrpc_recvmsg_next]		= "NEXT",
-	[rxrpc_recvmsg_cont]		= "CONT",
-	[rxrpc_recvmsg_full]		= "FULL",
-	[rxrpc_recvmsg_data_return]	= "DATA",
-	[rxrpc_recvmsg_terminal]	= "TERM",
-	[rxrpc_recvmsg_to_be_accepted]	= "TBAC",
-	[rxrpc_recvmsg_return]		= "RETN",
-};
-
-const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
-	[rxrpc_rtt_tx_ping]		= "PING",
-	[rxrpc_rtt_tx_data]		= "DATA",
-};
-
-const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
-	[rxrpc_rtt_rx_ping_response]	= "PONG",
-	[rxrpc_rtt_rx_requested_ack]	= "RACK",
-};
-
-const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
-	[rxrpc_timer_begin]			= "Begin ",
-	[rxrpc_timer_expired]			= "*EXPR*",
-	[rxrpc_timer_init_for_reply]		= "IniRpl",
-	[rxrpc_timer_init_for_send_reply]	= "SndRpl",
-	[rxrpc_timer_set_for_ack]		= "SetAck",
-	[rxrpc_timer_set_for_ping]		= "SetPng",
-	[rxrpc_timer_set_for_send]		= "SetTx ",
-	[rxrpc_timer_set_for_resend]		= "SetRTx",
-};
-
-const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
-	[rxrpc_propose_ack_client_tx_end]	= "ClTxEnd",
-	[rxrpc_propose_ack_input_data]		= "DataIn ",
-	[rxrpc_propose_ack_ping_for_lost_ack]	= "LostAck",
-	[rxrpc_propose_ack_ping_for_lost_reply]	= "LostRpl",
-	[rxrpc_propose_ack_ping_for_params]	= "Params ",
-	[rxrpc_propose_ack_processing_op]	= "ProcOp ",
-	[rxrpc_propose_ack_respond_to_ack]	= "Rsp2Ack",
-	[rxrpc_propose_ack_respond_to_ping]	= "Rsp2Png",
-	[rxrpc_propose_ack_retry_tx]		= "RetryTx",
-	[rxrpc_propose_ack_rotate_rx]		= "RxAck  ",
-	[rxrpc_propose_ack_terminal_ack]	= "ClTerm ",
-};
-
-const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
-	[rxrpc_propose_ack_use]			= "",
-	[rxrpc_propose_ack_update]		= " Update",
-	[rxrpc_propose_ack_subsume]		= " Subsume",
-};
-
-const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
-	[RXRPC_CALL_SLOW_START]		= "SlowStart",
-	[RXRPC_CALL_CONGEST_AVOIDANCE]	= "CongAvoid",
-	[RXRPC_CALL_PACKET_LOSS]	= "PktLoss  ",
-	[RXRPC_CALL_FAST_RETRANSMIT]	= "FastReTx ",
-};
-
-const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
-	[rxrpc_cong_begin_retransmission]	= " Retrans",
-	[rxrpc_cong_cleared_nacks]		= " Cleared",
-	[rxrpc_cong_new_low_nack]		= " NewLowN",
-	[rxrpc_cong_no_change]			= "",
-	[rxrpc_cong_progress]			= " Progres",
-	[rxrpc_cong_retransmit_again]		= " ReTxAgn",
-	[rxrpc_cong_rtt_window_end]		= " RttWinE",
-	[rxrpc_cong_saw_nack]			= " SawNack",
-};
-- 
cgit v1.2.3


From b1d9f7fde0bb6c143a9a5b9246ea191e28f2c364 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 5 Jan 2017 10:38:34 +0000
Subject: rxrpc: Add some more tracing

Add the following extra tracing information:

 (1) Modify the rxrpc_transmit tracepoint to record the Tx window size as
     this is varied by the slow-start algorithm.

 (2) Modify the rxrpc_rx_ack tracepoint to record more information from
     received ACK packets.

 (3) Add an rxrpc_rx_data tracepoint to record the information in DATA
     packets.

 (4) Add an rxrpc_disconnect_call tracepoint to record call disconnection,
     including the reason the call was disconnected.

 (5) Add an rxrpc_improper_term tracepoint to record implicit termination
     of a call by a client either by starting a new call on a particular
     connection channel without first transmitting the final ACK for the
     previous call.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h | 94 +++++++++++++++++++++++++++++++++++++++++---
 net/rxrpc/conn_object.c      |  1 +
 net/rxrpc/input.c            |  6 ++-
 3 files changed, 95 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 2395a57462c9..593f586545eb 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -595,6 +595,7 @@ TRACE_EVENT(rxrpc_transmit,
 		    __field(enum rxrpc_transmit_trace,	why		)
 		    __field(rxrpc_seq_t,		tx_hard_ack	)
 		    __field(rxrpc_seq_t,		tx_top		)
+		    __field(int,			tx_winsize	)
 			     ),
 
 	    TP_fast_assign(
@@ -602,38 +603,81 @@ TRACE_EVENT(rxrpc_transmit,
 		    __entry->why = why;
 		    __entry->tx_hard_ack = call->tx_hard_ack;
 		    __entry->tx_top = call->tx_top;
+		    __entry->tx_winsize = call->tx_winsize;
 			   ),
 
-	    TP_printk("c=%p %s f=%08x n=%u",
+	    TP_printk("c=%p %s f=%08x n=%u/%u",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_transmit_traces),
 		      __entry->tx_hard_ack + 1,
-		      __entry->tx_top - __entry->tx_hard_ack)
+		      __entry->tx_top - __entry->tx_hard_ack,
+		      __entry->tx_winsize)
+	    );
+
+TRACE_EVENT(rxrpc_rx_data,
+	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
+		     rxrpc_serial_t serial, u8 flags, u8 anno),
+
+	    TP_ARGS(call, seq, serial, flags, anno),
+
+	    TP_STRUCT__entry(
+		    __field(struct rxrpc_call *,	call		)
+		    __field(rxrpc_seq_t,		seq		)
+		    __field(rxrpc_serial_t,		serial		)
+		    __field(u8,				flags		)
+		    __field(u8,				anno		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call = call;
+		    __entry->seq = seq;
+		    __entry->serial = serial;
+		    __entry->flags = flags;
+		    __entry->anno = anno;
+			   ),
+
+	    TP_printk("c=%p DATA %08x q=%08x fl=%02x a=%02x",
+		      __entry->call,
+		      __entry->serial,
+		      __entry->seq,
+		      __entry->flags,
+		      __entry->anno)
 	    );
 
 TRACE_EVENT(rxrpc_rx_ack,
-	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, u8 reason, u8 n_acks),
+	    TP_PROTO(struct rxrpc_call *call,
+		     rxrpc_serial_t serial, rxrpc_serial_t ack_serial,
+		     rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks),
 
-	    TP_ARGS(call, first, reason, n_acks),
+	    TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks),
 
 	    TP_STRUCT__entry(
 		    __field(struct rxrpc_call *,	call		)
+		    __field(rxrpc_serial_t,		serial		)
+		    __field(rxrpc_serial_t,		ack_serial	)
 		    __field(rxrpc_seq_t,		first		)
+		    __field(rxrpc_seq_t,		prev		)
 		    __field(u8,				reason		)
 		    __field(u8,				n_acks		)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call = call;
+		    __entry->serial = serial;
+		    __entry->ack_serial = ack_serial;
 		    __entry->first = first;
+		    __entry->prev = prev;
 		    __entry->reason = reason;
 		    __entry->n_acks = n_acks;
 			   ),
 
-	    TP_printk("c=%p %s f=%08x n=%u",
+	    TP_printk("c=%p %08x %s r=%08x f=%08x p=%08x n=%u",
 		      __entry->call,
+		      __entry->serial,
 		      __print_symbolic(__entry->reason, rxrpc_ack_names),
+		      __entry->ack_serial,
 		      __entry->first,
+		      __entry->prev,
 		      __entry->n_acks)
 	    );
 
@@ -1001,6 +1045,46 @@ TRACE_EVENT(rxrpc_congest,
 		      __entry->sum.retrans_timeo ? " rTxTo" : "")
 	    );
 
+TRACE_EVENT(rxrpc_disconnect_call,
+	    TP_PROTO(struct rxrpc_call *call),
+
+	    TP_ARGS(call),
+
+	    TP_STRUCT__entry(
+		    __field(struct rxrpc_call *,	call		)
+		    __field(u32,			abort_code	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call = call;
+		    __entry->abort_code = call->abort_code;
+			   ),
+
+	    TP_printk("c=%p ab=%08x",
+		      __entry->call,
+		      __entry->abort_code)
+	    );
+
+TRACE_EVENT(rxrpc_improper_term,
+	    TP_PROTO(struct rxrpc_call *call),
+
+	    TP_ARGS(call),
+
+	    TP_STRUCT__entry(
+		    __field(struct rxrpc_call *,	call		)
+		    __field(u32,			abort_code	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call = call;
+		    __entry->abort_code = call->abort_code;
+			   ),
+
+	    TP_printk("c=%p ab=%08x",
+		      __entry->call,
+		      __entry->abort_code)
+	    );
+
 #endif /* _TRACE_RXRPC_H */
 
 /* This part must be outside protection */
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index e1e83af47866..b0ecb770fdce 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
 		/* Save the result of the call so that we can repeat it if necessary
 		 * through the channel, whilst disposing of the actual call record.
 		 */
+		trace_rxrpc_disconnect_call(call);
 		chan->last_service_id = call->service_id;
 		if (call->abort_code) {
 			chan->last_abort = call->abort_code;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 7c2abd85def9..78ec33477adf 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -481,6 +481,7 @@ next_subpacket:
 			return rxrpc_proto_abort("LSA", call, seq);
 	}
 
+	trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
 	if (before_eq(seq, hard_ack)) {
 		ack = RXRPC_ACK_DUPLICATE;
 		ack_serial = serial;
@@ -765,7 +766,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 	summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
 			      buf.ack.reason : RXRPC_ACK__INVALID);
 
-	trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
+	trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
+			   first_soft_ack, ntohl(buf.ack.previousPacket),
+			   summary.ack_reason, nr_acks);
 
 	if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
 		rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
@@ -951,6 +954,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
 		break;
 	}
 
+	trace_rxrpc_improper_term(call);
 	__rxrpc_disconnect_call(conn, call);
 	rxrpc_notify_socket(call);
 }
-- 
cgit v1.2.3


From 3e018daf045f3803c3ebe89dbb48318f24ffb790 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 5 Jan 2017 10:38:33 +0000
Subject: rxrpc: Show a call's hard-ACK cursors in /proc/net/rxrpc_calls

Show a call's hard-ACK cursors in /proc/net/rxrpc_calls so that a call's
progress can be more easily monitored.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/proc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 65cd980767fa..b9bcfbfb095c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	struct rxrpc_sock *rx;
 	struct rxrpc_peer *peer;
 	struct rxrpc_call *call;
+	rxrpc_seq_t tx_hard_ack, rx_hard_ack;
 	char lbuff[50], rbuff[50];
 
 	if (v == &rxrpc_calls) {
@@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	else
 		strcpy(rbuff, "no_connection");
 
+	tx_hard_ack = READ_ONCE(call->tx_hard_ack);
+	rx_hard_ack = READ_ONCE(call->rx_hard_ack);
 	seq_printf(seq,
 		   "UDP   %-47.47s %-47.47s %4x %08x %08x %s %3u"
-		   " %-8.8s %08x %lx\n",
+		   " %-8.8s %08x %lx %08x %02x %08x %02x\n",
 		   lbuff,
 		   rbuff,
 		   call->service_id,
@@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 		   atomic_read(&call->usage),
 		   rxrpc_call_states[call->state],
 		   call->abort_code,
-		   call->user_call_ID);
+		   call->user_call_ID,
+		   tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
+		   rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 343884c87e5a928193796415280269e723f951c7 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Wed, 4 Jan 2017 10:53:37 +0000
Subject: cfg80211: only pass sband to set_mandatory_flags_band()

The supported band structure contains the band is applies to
so no need to pass it separately. Also added a default case
to the switch for completeness. The current code base does not
call this function with NUM_NL80211_BANDS but kept that case
statement although default case would cover that.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 2cf7df8173b6..c91bc25c4dd5 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -136,12 +136,11 @@ struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
 }
 EXPORT_SYMBOL(ieee80211_get_channel);
 
-static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
-				     enum nl80211_band band)
+static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
 {
 	int i, want;
 
-	switch (band) {
+	switch (sband->band) {
 	case NL80211_BAND_5GHZ:
 		want = 3;
 		for (i = 0; i < sband->n_bitrates; i++) {
@@ -191,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
 		WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
 		break;
 	case NUM_NL80211_BANDS:
+	default:
 		WARN_ON(1);
 		break;
 	}
@@ -202,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
 
 	for (band = 0; band < NUM_NL80211_BANDS; band++)
 		if (wiphy->bands[band])
-			set_mandatory_flags_band(wiphy->bands[band], band);
+			set_mandatory_flags_band(wiphy->bands[band]);
 }
 
 bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
-- 
cgit v1.2.3


From 300ae149468f440a2629fa8b33d0ce1e860d479f Mon Sep 17 00:00:00 2001
From: Davide Caratti
Date: Mon, 2 Jan 2017 13:29:40 +0100
Subject: netfilter: select LIBCRC32C together with SCTP conntrack

nf_conntrack needs to compute crc32c when dealing with SCTP packets.
Moreover, NF_NAT_PROTO_SCTP (currently selecting LIBCRC32C) can be enabled
only if conntrack support for SCTP is enabled. Therefore, move enabling of
kernel support for crc32c so that it is selected when NF_CT_PROTO_SCTP=y.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 63729b489c2c..6d425e355bf5 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -162,6 +162,7 @@ config NF_CT_PROTO_SCTP
 	bool 'SCTP protocol connection tracking support'
 	depends on NETFILTER_ADVANCED
 	default y
+	select LIBCRC32C
 	help
 	  With this option enabled, the layer 3 independent connection
 	  tracking code will be able to do state tracking on SCTP connections.
@@ -397,7 +398,6 @@ config NF_NAT_PROTO_SCTP
 	bool
 	default NF_NAT && NF_CT_PROTO_SCTP
 	depends on NF_NAT && NF_CT_PROTO_SCTP
-	select LIBCRC32C
 
 config NF_NAT_AMANDA
 	tristate
-- 
cgit v1.2.3


From cf6e007eef83476c5d541453d84e08b07befe124 Mon Sep 17 00:00:00 2001
From: Davide Caratti
Date: Mon, 2 Jan 2017 13:29:41 +0100
Subject: netfilter: conntrack: validate SCTP crc32c in PREROUTING

implement sctp_error to let nf_conntrack_in validate crc32c on the packet
transport header. Assign skb->ip_summed to CHECKSUM_UNNECESSARY and return
NF_ACCEPT in case of successful validation; otherwise, return -NF_ACCEPT to
let netfilter skip connection tracking, like other protocols do.

Besides preventing corrupted packets from matching conntrack entries, this
fixes functionality of REJECT target: it was not generating any ICMP upon
reception of SCTP packets, because it was computing RFC 1624 checksum on
the packet and systematically mismatching crc32c in the SCTP header.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_sctp.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index a0efde38da44..44a647418948 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -22,7 +22,9 @@
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <net/sctp/checksum.h>
 
+#include <net/netfilter/nf_log.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
@@ -505,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
+static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
+		      unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		      u8 pf, unsigned int hooknum)
+{
+	const struct sctphdr *sh;
+	struct sctphdr _sctph;
+	const char *logmsg;
+
+	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+	if (!sh) {
+		logmsg = "nf_ct_sctp: short packet ";
+		goto out_invalid;
+	}
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    skb->ip_summed == CHECKSUM_NONE) {
+		if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
+			logmsg = "nf_ct_sctp: bad CRC ";
+			goto out_invalid;
+		}
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return NF_ACCEPT;
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_SCTP))
+		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg);
+	return -NF_ACCEPT;
+}
+
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -752,6 +782,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 	.packet 		= sctp_packet,
 	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
+	.error			= sctp_error,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.to_nlattr		= sctp_to_nlattr,
@@ -786,6 +817,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
 	.packet 		= sctp_packet,
 	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
+	.error			= sctp_error,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.to_nlattr		= sctp_to_nlattr,
-- 
cgit v1.2.3


From 4cc4b72c136a45aeccd86f66b0859b148b47d881 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Tue, 20 Dec 2016 22:02:13 +0800
Subject: netfilter: xt_connlimit: use rb_entry()

To make the code clearer, use rb_entry() instead of container_of() to
deal with rbtree.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_connlimit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 2aff2b7c4689..660b61dbd776 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root,
 		int diff;
 		bool addit;
 
-		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
 
 		parent = *rbnode;
 		diff = same_source_net(addr, mask, &rbconn->addr, family);
@@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r)
 	struct rb_node *node;
 
 	while ((node = rb_first(r)) != NULL) {
-		rbconn = container_of(node, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(node, struct xt_connlimit_rb, node);
 
 		rb_erase(node, r);
 
-- 
cgit v1.2.3


From 5e6eb456983c994a8e582127a300b6552e5a1768 Mon Sep 17 00:00:00 2001
From: Volodymyr Bendiuga
Date: Thu, 5 Jan 2017 11:10:13 +0100
Subject: net:dsa: check for EPROBE_DEFER from dsa_dst_parse()

Since there can be multiple dsa switches stacked together but
not all of devicetree nodes available at the time of calling
dsa_dst_parse(), EPROBE_DEFER can be returned by it. When this
happens, only the last dsa switch has to be deleted by
dsa_dst_del_ds(), but not the whole list, because next time linux
cames back to this function it will try to add only the last dsa
switch which returned EPROBE_DEFER.

Signed-off-by: Volodymyr Bendiuga <volodymyr.bendiuga@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 5fff951a0a49..bad119cee2a3 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -647,8 +647,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
 	}
 
 	err = dsa_dst_parse(dst);
-	if (err)
+	if (err) {
+		if (err == -EPROBE_DEFER) {
+			dsa_dst_del_ds(dst, ds, ds->index);
+			return err;
+		}
+
 		goto out_del_dst;
+	}
 
 	err = dsa_dst_apply(dst);
 	if (err) {
-- 
cgit v1.2.3


From ad02c4f547826167a709dab8a89a1caefd2c1f50 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Wed, 4 Jan 2017 11:19:34 -0500
Subject: tcp: provide timestamps for partial writes

For TCP sockets, TX timestamps are only captured when the user data
is successfully and fully written to the socket. In many cases,
however, TCP writes can be partial for which no timestamp is
collected.

Collect timestamps whenever any user data is (fully or partially)
copied into the socket. Pass tcp_write_queue_tail to tcp_tx_timestamp
instead of the local skb pointer since it can be set to NULL on
the error path.

Note that tcp_write_queue_tail can be NULL, even if bytes have been
copied to the socket. This is because acknowledgements are being
processed in tcp_sendmsg(), and by the time tcp_tx_timestamp is
called tcp_write_queue_tail can be NULL. For such cases, this patch
does not collect any timestamps (i.e., it is best-effort).

This patch is written with suggestions from Willem de Bruijn and
Eric Dumazet.

Change-log V1 -> V2:
	- Use sockc.tsflags instead of sk->sk_tsflags.
	- Use the same code path for normal writes and errors.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2e3807d8eba8..ec97e4b4a62f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -429,7 +429,7 @@ EXPORT_SYMBOL(tcp_init_sock);
 
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 {
-	if (tsflags) {
+	if (tsflags && skb) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
@@ -958,10 +958,8 @@ new_segment:
 		copied += copy;
 		offset += copy;
 		size -= copy;
-		if (!size) {
-			tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+		if (!size)
 			goto out;
-		}
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
 			continue;
@@ -987,8 +985,11 @@ wait_for_memory:
 	}
 
 out:
-	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
-		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	if (copied) {
+		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+		if (!(flags & MSG_SENDPAGE_NOTLAST))
+			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	}
 	return copied;
 
 do_error:
@@ -1281,7 +1282,6 @@ new_segment:
 
 		copied += copy;
 		if (!msg_data_left(msg)) {
-			tcp_tx_timestamp(sk, sockc.tsflags, skb);
 			if (unlikely(flags & MSG_EOR))
 				TCP_SKB_CB(skb)->eor = 1;
 			goto out;
@@ -1312,8 +1312,10 @@ wait_for_memory:
 	}
 
 out:
-	if (copied)
+	if (copied) {
+		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	}
 out_nopush:
 	release_sock(sk);
 	return copied + copied_syn;
-- 
cgit v1.2.3


From b3b73b8e6df685ba61476b256f98eff1d650c199 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 5 Jan 2017 13:23:58 +0100
Subject: xfrm: state: do not acquire lock in get_mtu helpers

Once flow cache gets removed the mtu initialisation happens for every skb
that gets an xfrm attached, so this lock starts to show up in perf.

It is not obvious why this lock is required -- the caller holds
reference on the state struct, type->destructor is only called from the
state gc worker (all state structs on gc list must have refcount 0).

xfrm_init_state already has been called (else private data accessed
by type->get_mtu() would not be set up).

So just remove the lock -- the race on the state (DEAD?) doesn't
matter (could change right after dropping the lock too).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c5cf4d611aab..6b3366fef019 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2000,16 +2000,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
 int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 {
-	int res;
+	const struct xfrm_type *type = READ_ONCE(x->type);
 
-	spin_lock_bh(&x->lock);
 	if (x->km.state == XFRM_STATE_VALID &&
-	    x->type && x->type->get_mtu)
-		res = x->type->get_mtu(x, mtu);
-	else
-		res = mtu - x->props.header_len;
-	spin_unlock_bh(&x->lock);
-	return res;
+	    type && type->get_mtu)
+		return type->get_mtu(x, mtu);
+
+	return mtu - x->props.header_len;
 }
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
-- 
cgit v1.2.3


From 4787cfa08476d4193c35d958f564f643bb6ae234 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki
Date: Wed, 4 Jan 2017 18:58:30 +0100
Subject: cfg80211: move function checking range fit to util.c

It is needed for another cfg80211 helper that will be out of reg.c so
move it to common util.c file and make it non-static.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/core.h |  3 +++
 net/wireless/reg.c  | 27 +++++++--------------------
 net/wireless/util.c | 15 +++++++++++++++
 3 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.h b/net/wireless/core.h
index af6e023020b1..bc8ba6e57519 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -430,6 +430,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
 void cfg80211_process_wdev_events(struct wireless_dev *wdev);
 
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+				u32 center_freq_khz, u32 bw_khz);
+
 /**
  * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
  * @wiphy: the wiphy to validate against
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5dbac3749738..753efcd51fa3 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
 	return true;
 }
 
-static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
-			    u32 center_freq_khz, u32 bw_khz)
-{
-	u32 start_freq_khz, end_freq_khz;
-
-	start_freq_khz = center_freq_khz - (bw_khz/2);
-	end_freq_khz = center_freq_khz + (bw_khz/2);
-
-	if (start_freq_khz >= freq_range->start_freq_khz &&
-	    end_freq_khz <= freq_range->end_freq_khz)
-		return true;
-
-	return false;
-}
-
 /**
  * freq_in_rule_band - tells us if a frequency is in a frequency band
  * @freq_range: frequency rule we want to query
@@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq,
 		if (!band_rule_found)
 			band_rule_found = freq_in_rule_band(fr, center_freq);
 
-		bw_fits = reg_does_bw_fit(fr, center_freq, bw);
+		bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw);
 
 		if (band_rule_found && bw_fits)
 			return rr;
@@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
 		max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
 
 	/* If we get a reg_rule we can assume that at least 5Mhz fit */
-	if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
-			     MHZ_TO_KHZ(10)))
+	if (!cfg80211_does_bw_fit_range(freq_range,
+					MHZ_TO_KHZ(chan->center_freq),
+					MHZ_TO_KHZ(10)))
 		bw_flags |= IEEE80211_CHAN_NO_10MHZ;
-	if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
-			     MHZ_TO_KHZ(20)))
+	if (!cfg80211_does_bw_fit_range(freq_range,
+					MHZ_TO_KHZ(chan->center_freq),
+					MHZ_TO_KHZ(20)))
 		bw_flags |= IEEE80211_CHAN_NO_20MHZ;
 
 	if (max_bandwidth_khz < MHZ_TO_KHZ(10))
diff --git a/net/wireless/util.c b/net/wireless/util.c
index c91bc25c4dd5..cd8a7ae55e7d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1847,6 +1847,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
 }
 EXPORT_SYMBOL(cfg80211_free_nan_func);
 
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+				u32 center_freq_khz, u32 bw_khz)
+{
+	u32 start_freq_khz, end_freq_khz;
+
+	start_freq_khz = center_freq_khz - (bw_khz / 2);
+	end_freq_khz = center_freq_khz + (bw_khz / 2);
+
+	if (start_freq_khz >= freq_range->start_freq_khz &&
+	    end_freq_khz <= freq_range->end_freq_khz)
+		return true;
+
+	return false;
+}
+
 /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
 /* Ethernet-II snap header (RFC1042 for most EtherTypes) */
 const unsigned char rfc1042_header[] __aligned(2) =
-- 
cgit v1.2.3


From e691ac2f75b69bee743f0370d79454ba4429b175 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki
Date: Wed, 4 Jan 2017 18:58:31 +0100
Subject: cfg80211: support ieee80211-freq-limit DT property

This patch adds a helper for reading that new property and applying
limitations of supported channels specified this way.
It is used with devices that normally support a wide wireless band but
in a given config are limited to some part of it (usually due to board
design). For example a dual-band chipset may be able to support one band
only because of used antennas.
It's also common that tri-band routers have separated radios for lower
and higher part of 5 GHz band and it may be impossible to say which is
which without a DT info.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
[add new function to documentation, fix link]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/80211/cfg80211.rst |   3 +
 include/net/cfg80211.h           |  28 ++++++++
 net/wireless/Makefile            |   1 +
 net/wireless/of.c                | 138 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 net/wireless/of.c

(limited to 'net')

diff --git a/Documentation/80211/cfg80211.rst b/Documentation/80211/cfg80211.rst
index b1e149ea6fee..eca534ab6172 100644
--- a/Documentation/80211/cfg80211.rst
+++ b/Documentation/80211/cfg80211.rst
@@ -44,6 +44,9 @@ Device registration
 .. kernel-doc:: include/net/cfg80211.h
    :functions: wiphy_new
 
+.. kernel-doc:: include/net/cfg80211.h
+   :functions: wiphy_read_of_freq_limits
+
 .. kernel-doc:: include/net/cfg80211.h
    :functions: wiphy_register
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ca2ac1ce5862..41a9ecd82ca0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -311,6 +311,34 @@ struct ieee80211_supported_band {
 	struct ieee80211_sta_vht_cap vht_cap;
 };
 
+/**
+ * wiphy_read_of_freq_limits - read frequency limits from device tree
+ *
+ * @wiphy: the wireless device to get extra limits for
+ *
+ * Some devices may have extra limitations specified in DT. This may be useful
+ * for chipsets that normally support more bands but are limited due to board
+ * design (e.g. by antennas or external power amplifier).
+ *
+ * This function reads info from DT and uses it to *modify* channels (disable
+ * unavailable ones). It's usually a *bad* idea to use it in drivers with
+ * shared channel data as DT limitations are device specific. You should make
+ * sure to call it only if channels in wiphy are copied and can be modified
+ * without affecting other devices.
+ *
+ * As this function access device node it has to be called after set_wiphy_dev.
+ * It also modifies channels so they have to be set first.
+ * If using this helper, call it before wiphy_register().
+ */
+#ifdef CONFIG_OF
+void wiphy_read_of_freq_limits(struct wiphy *wiphy);
+#else /* CONFIG_OF */
+static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy)
+{
+}
+#endif /* !CONFIG_OF */
+
+
 /*
  * Wireless hardware/device configuration structures and methods
  */
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 4c9e39f04ef8..95b4c0915412 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
 
 cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
 cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-$(CONFIG_OF) += of.o
 cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
 cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
 cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
diff --git a/net/wireless/of.c b/net/wireless/of.c
new file mode 100644
index 000000000000..de221f0edca5
--- /dev/null
+++ b/net/wireless/of.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/of.h>
+#include <net/cfg80211.h>
+#include "core.h"
+
+static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy,
+					 struct ieee80211_freq_range *freq_limits,
+					 unsigned int n_freq_limits,
+					 struct ieee80211_channel *chan)
+{
+	u32 bw = MHZ_TO_KHZ(20);
+	int i;
+
+	for (i = 0; i < n_freq_limits; i++) {
+		struct ieee80211_freq_range *limit = &freq_limits[i];
+
+		if (cfg80211_does_bw_fit_range(limit,
+					       MHZ_TO_KHZ(chan->center_freq),
+					       bw))
+			return true;
+	}
+
+	return false;
+}
+
+static void wiphy_freq_limits_apply(struct wiphy *wiphy,
+				    struct ieee80211_freq_range *freq_limits,
+				    unsigned int n_freq_limits)
+{
+	enum nl80211_band band;
+	int i;
+
+	if (WARN_ON(!n_freq_limits))
+		return;
+
+	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		struct ieee80211_supported_band *sband = wiphy->bands[band];
+
+		if (!sband)
+			continue;
+
+		for (i = 0; i < sband->n_channels; i++) {
+			struct ieee80211_channel *chan = &sband->channels[i];
+
+			if (chan->flags & IEEE80211_CHAN_DISABLED)
+				continue;
+
+			if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits,
+							  n_freq_limits,
+							  chan)) {
+				pr_debug("Disabling freq %d MHz as it's out of OF limits\n",
+					 chan->center_freq);
+				chan->flags |= IEEE80211_CHAN_DISABLED;
+			}
+		}
+	}
+}
+
+void wiphy_read_of_freq_limits(struct wiphy *wiphy)
+{
+	struct device *dev = wiphy_dev(wiphy);
+	struct device_node *np;
+	struct property *prop;
+	struct ieee80211_freq_range *freq_limits;
+	unsigned int n_freq_limits;
+	const __be32 *p;
+	int len, i;
+	int err = 0;
+
+	if (!dev)
+		return;
+	np = dev_of_node(dev);
+	if (!np)
+		return;
+
+	prop = of_find_property(np, "ieee80211-freq-limit", &len);
+	if (!prop)
+		return;
+
+	if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) {
+		dev_err(dev, "ieee80211-freq-limit wrong format");
+		return;
+	}
+	n_freq_limits = len / sizeof(u32) / 2;
+
+	freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL);
+	if (!freq_limits) {
+		err = -ENOMEM;
+		goto out_kfree;
+	}
+
+	p = NULL;
+	for (i = 0; i < n_freq_limits; i++) {
+		struct ieee80211_freq_range *limit = &freq_limits[i];
+
+		p = of_prop_next_u32(prop, p, &limit->start_freq_khz);
+		if (!p) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+
+		p = of_prop_next_u32(prop, p, &limit->end_freq_khz);
+		if (!p) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+
+		if (!limit->start_freq_khz ||
+		    !limit->end_freq_khz ||
+		    limit->start_freq_khz >= limit->end_freq_khz) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+	}
+
+	wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits);
+
+out_kfree:
+	kfree(freq_limits);
+	if (err)
+		dev_err(dev, "Failed to get limits: %d\n", err);
+}
+EXPORT_SYMBOL(wiphy_read_of_freq_limits);
-- 
cgit v1.2.3


From f784ad3d79e5be062b19dc36c53413daffeecc5c Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar
Date: Wed, 4 Jan 2017 15:01:01 -0800
Subject: ipv6: do not send RTM_DELADDR for tentative addresses

RTM_NEWADDR notification is sent when IFA_F_TENTATIVE is cleared from
the address. So if the address is added and deleted before DAD probes
completes, the RTM_DELADDR will be sent for which there was no
RTM_NEWADDR causing asymmetry in notification. However if the same
logic is used while sending RTM_DELADDR notification, this asymmetry
can be avoided.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c1e124bc8e1e..ac9bd5620f81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4888,6 +4888,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 	struct net *net = dev_net(ifa->idev->dev);
 	int err = -ENOBUFS;
 
+	/* Don't send DELADDR notification for TENTATIVE address,
+	 * since NEWADDR notification is sent only after removing
+	 * TENTATIVE flag.
+	 */
+	if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+		return;
+
 	skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
-- 
cgit v1.2.3


From 7558828adebe675179f6a489c7e04082828bf524 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Thu, 5 Jan 2017 12:28:41 -0500
Subject: net: dsa: remove version string

The dsa_driver_version string is irrelevant and has not been bumped
since its introduction about 9 years ago. Kill it.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c      | 5 -----
 net/dsa/dsa_priv.h | 1 -
 net/dsa/slave.c    | 1 -
 3 files changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 89e66b623d73..3f85be0aae34 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -27,8 +27,6 @@
 #include <linux/gpio/consumer.h>
 #include "dsa_priv.h"
 
-char dsa_driver_version[] = "0.1";
-
 static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
 					    struct net_device *dev)
 {
@@ -926,9 +924,6 @@ static int dsa_probe(struct platform_device *pdev)
 	struct dsa_switch_tree *dst;
 	int ret;
 
-	pr_notice_once("Distributed Switch Architecture driver version %s\n",
-		       dsa_driver_version);
-
 	if (pdev->dev.of_node) {
 		ret = dsa_of_probe(&pdev->dev);
 		if (ret)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6cfd7388834e..63ae1484abae 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -49,7 +49,6 @@ struct dsa_slave_priv {
 };
 
 /* dsa.c */
-extern char dsa_driver_version[];
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 		      struct device_node *port_dn, int port);
 void dsa_cpu_dsa_destroy(struct device_node *port_dn);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index ffd91969b830..5cd5b8137c08 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -673,7 +673,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
 				  struct ethtool_drvinfo *drvinfo)
 {
 	strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
-	strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
 	strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
 	strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
 }
-- 
cgit v1.2.3


From 0c8d803f397a65adfffd3aa01f52555389f33239 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 5 Jan 2017 19:32:46 -0800
Subject: net: ipv4: Simplify rt_fill_info

rt_fill_info has only 1 caller and both of the last 2 args -- nowait
and flags -- are hardcoded to 0. Given that remove them as input arguments
and simplify rt_fill_info accordingly.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0fcac8e7a2b2..7b52ac20145b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2454,7 +2454,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq, int event, int nowait, unsigned int flags)
+			u32 seq, int event)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
@@ -2463,7 +2463,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	u32 error;
 	u32 metrics[RTAX_MAX];
 
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
 	if (!nlh)
 		return -EMSGSIZE;
 
@@ -2541,18 +2541,12 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
 						 fl4->saddr, fl4->daddr,
-						 r, nowait, portid);
+						 r, 0, portid);
 
 			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-					error = err;
-				}
+				if (err == 0)
+					return 0;
+				goto nla_put_failure;
 			}
 		} else
 #endif
@@ -2665,7 +2659,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 
 	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
-			   RTM_NEWROUTE, 0, 0);
+			   RTM_NEWROUTE);
 	if (err < 0)
 		goto errout_free;
 
-- 
cgit v1.2.3


From c7b371e34c0b9ed9e23271608448f89e6d66ba0a Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 5 Jan 2017 19:33:59 -0800
Subject: net: ipv4: make fib_select_default static

fib_select_default has a single caller within the same file.
Make it static.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 1 -
 net/ipv4/fib_semantics.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5f376af377c7..57c2a863d0b2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -344,7 +344,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			u8 tos, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 7a5b4c7d9a87..05c911d21782 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1434,7 +1434,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
-- 
cgit v1.2.3


From df560056d960a3e164c179d89770d5a51b798537 Mon Sep 17 00:00:00 2001
From: Eric Garver
Date: Thu, 5 Jan 2017 20:22:36 -0500
Subject: udp: inuse checks can quit early for reuseport

UDP lib inuse checks will walk the entire hash bucket to check if the
portaddr is in use. In the case of reuseport we can stop searching when
we find a matching reuseport.

On a 16-core VM a test program that spawns 16 threads that each bind to
1024 sockets (one per 10ms) takes 1m45s. With this change it takes 11s.

Also add a cond_resched() when the port is not specified.

Signed-off-by: Eric Garver <e@erig.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1307a7c2e544..4318d72e0248 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -153,13 +153,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
-		     rcu_access_pointer(sk->sk_reuseport_cb) ||
-		     !uid_eq(uid, sock_i_uid(sk2))) &&
 		    saddr_comp(sk, sk2, true)) {
-			if (!bitmap)
-				return 1;
-			__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				if (!bitmap)
+					return 0;
+			} else {
+				if (!bitmap)
+					return 1;
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
+					  bitmap);
+			}
 		}
 	}
 	return 0;
@@ -188,11 +193,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
-		     rcu_access_pointer(sk->sk_reuseport_cb) ||
-		     !uid_eq(uid, sock_i_uid(sk2))) &&
 		    saddr_comp(sk, sk2, true)) {
-			res = 1;
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				res = 0;
+			} else {
+				res = 1;
+			}
 			break;
 		}
 	}
@@ -285,6 +293,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 				snum += rand;
 			} while (snum != first);
 			spin_unlock_bh(&hslot->lock);
+			cond_resched();
 		} while (++first != last);
 		goto fail;
 	} else {
-- 
cgit v1.2.3


From a83863174a6137fb3e03f279c9dcdba9e35315d0 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 6 Jan 2017 22:18:33 +0800
Subject: sctp: prepare asoc stream for stream reconf

sctp stream reconf, described in RFC 6525, needs a structure to
save per stream information in assoc, like stream state.

In the future, sctp stream scheduler also needs it to save some
stream scheduler params and queues.

This patchset is to prepare the stream array in assoc for stream
reconf. It defines sctp_stream that includes stream arrays inside
to replace ssnmap.

Note that we use different structures for IN and OUT streams, as
the members in per OUT stream will get more and more different
from per IN stream.

v1->v2:
  - put these patches into a smaller group.
v2->v3:
  - define sctp_stream to contain stream arrays, and create stream.c
    to put stream-related functions.
  - merge 3 patches into 1, as new sctp_stream has the same name
    with before.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    |   1 -
 include/net/sctp/structs.h |  76 +++++++++++----------------
 net/sctp/Makefile          |   2 +-
 net/sctp/associola.c       |  13 +++--
 net/sctp/objcnt.c          |   2 -
 net/sctp/sm_make_chunk.c   |  10 ++--
 net/sctp/sm_statefuns.c    |   3 +-
 net/sctp/ssnmap.c          | 125 ---------------------------------------------
 net/sctp/stream.c          |  85 ++++++++++++++++++++++++++++++
 net/sctp/ulpqueue.c        |  36 ++++++-------
 10 files changed, 147 insertions(+), 206 deletions(-)
 delete mode 100644 net/sctp/ssnmap.c
 create mode 100644 net/sctp/stream.c

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d8833a86cd7e..598d938b0d0a 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -283,7 +283,6 @@ extern atomic_t sctp_dbg_objcnt_chunk;
 extern atomic_t sctp_dbg_objcnt_bind_addr;
 extern atomic_t sctp_dbg_objcnt_bind_bucket;
 extern atomic_t sctp_dbg_objcnt_addr;
-extern atomic_t sctp_dbg_objcnt_ssnmap;
 extern atomic_t sctp_dbg_objcnt_datamsg;
 extern atomic_t sctp_dbg_objcnt_keys;
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 87d56cc80a3c..4741ec240caf 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -82,7 +82,6 @@ struct sctp_outq;
 struct sctp_bind_addr;
 struct sctp_ulpq;
 struct sctp_ep_common;
-struct sctp_ssnmap;
 struct crypto_shash;
 
 
@@ -377,54 +376,22 @@ typedef struct sctp_sender_hb_info {
 	__u64 hb_nonce;
 } __packed sctp_sender_hb_info_t;
 
-/*
- *  RFC 2960 1.3.2 Sequenced Delivery within Streams
- *
- *  The term "stream" is used in SCTP to refer to a sequence of user
- *  messages that are to be delivered to the upper-layer protocol in
- *  order with respect to other messages within the same stream.  This is
- *  in contrast to its usage in TCP, where it refers to a sequence of
- *  bytes (in this document a byte is assumed to be eight bits).
- *  ...
- *
- *  This is the structure we use to track both our outbound and inbound
- *  SSN, or Stream Sequence Numbers.
- */
-
-struct sctp_stream {
-	__u16 *ssn;
-	unsigned int len;
-};
-
-struct sctp_ssnmap {
-	struct sctp_stream in;
-	struct sctp_stream out;
-};
-
-struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
-				    gfp_t gfp);
-void sctp_ssnmap_free(struct sctp_ssnmap *map);
-void sctp_ssnmap_clear(struct sctp_ssnmap *map);
+struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp);
+void sctp_stream_free(struct sctp_stream *stream);
+void sctp_stream_clear(struct sctp_stream *stream);
 
 /* What is the current SSN number for this stream? */
-static inline __u16 sctp_ssn_peek(struct sctp_stream *stream, __u16 id)
-{
-	return stream->ssn[id];
-}
+#define sctp_ssn_peek(stream, type, sid) \
+	((stream)->type[sid].ssn)
 
 /* Return the next SSN number for this stream.	*/
-static inline __u16 sctp_ssn_next(struct sctp_stream *stream, __u16 id)
-{
-	return stream->ssn[id]++;
-}
+#define sctp_ssn_next(stream, type, sid) \
+	((stream)->type[sid].ssn++)
 
 /* Skip over this ssn and all below. */
-static inline void sctp_ssn_skip(struct sctp_stream *stream, __u16 id, 
-				 __u16 ssn)
-{
-	stream->ssn[id] = ssn+1;
-}
-              
+#define sctp_ssn_skip(stream, type, sid, ssn) \
+	((stream)->type[sid].ssn = ssn + 1)
+
 /*
  * Pointers to address related SCTP functions.
  * (i.e. things that depend on the address family.)
@@ -1331,6 +1298,25 @@ struct sctp_inithdr_host {
 	__u32 initial_tsn;
 };
 
+struct sctp_stream_out {
+	__u16	ssn;
+	__u8	state;
+};
+
+struct sctp_stream_in {
+	__u16	ssn;
+};
+
+struct sctp_stream {
+	struct sctp_stream_out *out;
+	struct sctp_stream_in *in;
+	__u16 outcnt;
+	__u16 incnt;
+};
+
+#define SCTP_STREAM_CLOSED		0x00
+#define SCTP_STREAM_OPEN		0x01
+
 /* SCTP_GET_ASSOC_STATS counters */
 struct sctp_priv_assoc_stats {
 	/* Maximum observed rto in the association during subsequent
@@ -1746,8 +1732,8 @@ struct sctp_association {
 	/* Default receive parameters */
 	__u32 default_rcv_context;
 
-	/* This tracks outbound ssn for a given stream.	 */
-	struct sctp_ssnmap *ssnmap;
+	/* Stream arrays */
+	struct sctp_stream *stream;
 
 	/* All outbound chunks go through this structure.  */
 	struct sctp_outq outqueue;
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6c4f7496cec6..70f1b570bab9 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  transport.o chunk.o sm_make_chunk.o ulpevent.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
-	  output.o input.o debug.o ssnmap.o auth.o \
+	  output.o input.o debug.o stream.o auth.o \
 	  offload.o
 
 sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index d3cc30c25c41..36294f7fb9a7 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -358,8 +358,8 @@ void sctp_association_free(struct sctp_association *asoc)
 
 	sctp_tsnmap_free(&asoc->peer.tsn_map);
 
-	/* Free ssnmap storage. */
-	sctp_ssnmap_free(asoc->ssnmap);
+	/* Free stream information. */
+	sctp_stream_free(asoc->stream);
 
 	/* Clean up the bound address list. */
 	sctp_bind_addr_free(&asoc->base.bind_addr);
@@ -1137,7 +1137,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
 		/* Reinitialize SSN for both local streams
 		 * and peer's streams.
 		 */
-		sctp_ssnmap_clear(asoc->ssnmap);
+		sctp_stream_clear(asoc->stream);
 
 		/* Flush the ULP reassembly and ordered queue.
 		 * Any data there will now be stale and will
@@ -1162,10 +1162,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
 
 		asoc->ctsn_ack_point = asoc->next_tsn - 1;
 		asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
-		if (!asoc->ssnmap) {
-			/* Move the ssnmap. */
-			asoc->ssnmap = new->ssnmap;
-			new->ssnmap = NULL;
+		if (!asoc->stream) {
+			asoc->stream = new->stream;
+			new->stream = NULL;
 		}
 
 		if (!asoc->assoc_id) {
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 40e7fac96c41..105ac3327b28 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
 SCTP_DBG_OBJCNT(bind_bucket);
 SCTP_DBG_OBJCNT(chunk);
 SCTP_DBG_OBJCNT(addr);
-SCTP_DBG_OBJCNT(ssnmap);
 SCTP_DBG_OBJCNT(datamsg);
 SCTP_DBG_OBJCNT(keys);
 
@@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
 	SCTP_DBG_OBJCNT_ENTRY(bind_addr),
 	SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
 	SCTP_DBG_OBJCNT_ENTRY(addr),
-	SCTP_DBG_OBJCNT_ENTRY(ssnmap),
 	SCTP_DBG_OBJCNT_ENTRY(datamsg),
 	SCTP_DBG_OBJCNT_ENTRY(keys),
 };
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b7afe1..a15d824a313d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1536,7 +1536,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
 
 	/* All fragments will be on the same stream */
 	sid = ntohs(chunk->subh.data_hdr->stream);
-	stream = &chunk->asoc->ssnmap->out;
+	stream = chunk->asoc->stream;
 
 	/* Now assign the sequence number to the entire message.
 	 * All fragments must have the same stream sequence number.
@@ -1547,9 +1547,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
 			ssn = 0;
 		} else {
 			if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
-				ssn = sctp_ssn_next(stream, sid);
+				ssn = sctp_ssn_next(stream, out, sid);
 			else
-				ssn = sctp_ssn_peek(stream, sid);
+				ssn = sctp_ssn_peek(stream, out, sid);
 		}
 
 		lchunk->subh.data_hdr->ssn = htons(ssn);
@@ -2444,9 +2444,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	if (!asoc->temp) {
 		int error;
 
-		asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
+		asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams,
 					       asoc->c.sinit_num_ostreams, gfp);
-		if (!asoc->ssnmap)
+		if (!asoc->stream)
 			goto clean_up;
 
 		error = sctp_assoc_set_id(asoc, gfp);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3382ef254e7b..0ceded37d20b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6274,9 +6274,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * and is invalid.
 	 */
 	ssn = ntohs(data_hdr->ssn);
-	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
+	if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
 		return SCTP_IERROR_PROTO_VIOLATION;
-	}
 
 	/* Send the data up to the user.  Note:  Schedule  the
 	 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
deleted file mode 100644
index b9c8521c1a98..000000000000
--- a/net/sctp/ssnmap.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/* SCTP kernel implementation
- * Copyright (c) 2003 International Business Machines, Corp.
- *
- * This file is part of the SCTP kernel implementation
- *
- * These functions manipulate sctp SSN tracker.
- *
- * This SCTP implementation is free software;
- * you can redistribute it and/or modify it under the terms of
- * the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This SCTP implementation is distributed in the hope that it
- * will be useful, but WITHOUT ANY WARRANTY; without even the implied
- *                 ************************
- * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNU CC; see the file COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- * Please send any bug reports or fixes you make to the
- * email address(es):
- *    lksctp developers <linux-sctp@vger.kernel.org>
- *
- * Written or modified by:
- *    Jon Grimm             <jgrimm@us.ibm.com>
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
-					    __u16 out);
-
-/* Storage size needed for map includes 2 headers and then the
- * specific needs of in or out streams.
- */
-static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
-{
-	return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
-}
-
-
-/* Create a new sctp_ssnmap.
- * Allocate room to store at least 'len' contiguous TSNs.
- */
-struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
-				    gfp_t gfp)
-{
-	struct sctp_ssnmap *retval;
-	int size;
-
-	size = sctp_ssnmap_size(in, out);
-	if (size <= KMALLOC_MAX_SIZE)
-		retval = kmalloc(size, gfp);
-	else
-		retval = (struct sctp_ssnmap *)
-			  __get_free_pages(gfp, get_order(size));
-	if (!retval)
-		goto fail;
-
-	if (!sctp_ssnmap_init(retval, in, out))
-		goto fail_map;
-
-	SCTP_DBG_OBJCNT_INC(ssnmap);
-
-	return retval;
-
-fail_map:
-	if (size <= KMALLOC_MAX_SIZE)
-		kfree(retval);
-	else
-		free_pages((unsigned long)retval, get_order(size));
-fail:
-	return NULL;
-}
-
-
-/* Initialize a block of memory as a ssnmap.  */
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
-					    __u16 out)
-{
-	memset(map, 0x00, sctp_ssnmap_size(in, out));
-
-	/* Start 'in' stream just after the map header. */
-	map->in.ssn = (__u16 *)&map[1];
-	map->in.len = in;
-
-	/* Start 'out' stream just after 'in'. */
-	map->out.ssn = &map->in.ssn[in];
-	map->out.len = out;
-
-	return map;
-}
-
-/* Clear out the ssnmap streams.  */
-void sctp_ssnmap_clear(struct sctp_ssnmap *map)
-{
-	size_t size;
-
-	size = (map->in.len + map->out.len) * sizeof(__u16);
-	memset(map->in.ssn, 0x00, size);
-}
-
-/* Dispose of a ssnmap.  */
-void sctp_ssnmap_free(struct sctp_ssnmap *map)
-{
-	int size;
-
-	if (unlikely(!map))
-		return;
-
-	size = sctp_ssnmap_size(map->in.len, map->out.len);
-	if (size <= KMALLOC_MAX_SIZE)
-		kfree(map);
-	else
-		free_pages((unsigned long)map, get_order(size));
-
-	SCTP_DBG_OBJCNT_DEC(ssnmap);
-}
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
new file mode 100644
index 000000000000..f86de43cbbe5
--- /dev/null
+++ b/net/sctp/stream.c
@@ -0,0 +1,85 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp tsn mapping array.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/sctp/sctp.h>
+
+struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp)
+{
+	struct sctp_stream *stream;
+	int i;
+
+	stream = kzalloc(sizeof(*stream), gfp);
+	if (!stream)
+		return NULL;
+
+	stream->outcnt = outcnt;
+	stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
+	if (!stream->out) {
+		kfree(stream);
+		return NULL;
+	}
+	for (i = 0; i < stream->outcnt; i++)
+		stream->out[i].state = SCTP_STREAM_OPEN;
+
+	stream->incnt = incnt;
+	stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
+	if (!stream->in) {
+		kfree(stream->out);
+		kfree(stream);
+		return NULL;
+	}
+
+	return stream;
+}
+
+void sctp_stream_free(struct sctp_stream *stream)
+{
+	if (unlikely(!stream))
+		return;
+
+	kfree(stream->out);
+	kfree(stream->in);
+	kfree(stream);
+}
+
+void sctp_stream_clear(struct sctp_stream *stream)
+{
+	int i;
+
+	for (i = 0; i < stream->outcnt; i++)
+		stream->out[i].ssn = 0;
+
+	for (i = 0; i < stream->incnt; i++)
+		stream->in[i].ssn = 0;
+}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 84d0fdaf7de9..aa3624d50278 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 	struct sk_buff_head *event_list;
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 	__u16 sid, csid, cssn;
 
 	sid = event->stream;
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
 
@@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 		if (csid < sid)
 			continue;
 
-		if (cssn != sctp_ssn_peek(in, sid))
+		if (cssn != sctp_ssn_peek(stream, in, sid))
 			break;
 
-		/* Found it, so mark in the ssnmap. */
-		sctp_ssn_next(in, sid);
+		/* Found it, so mark in the stream. */
+		sctp_ssn_next(stream, in, sid);
 
 		__skb_unlink(pos, &ulpq->lobby);
 
@@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 					     struct sctp_ulpevent *event)
 {
 	__u16 sid, ssn;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 
 	/* Check if this message needs ordering.  */
 	if (SCTP_DATA_UNORDERED & event->msg_flags)
@@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	/* Note: The stream ID must be verified before this routine.  */
 	sid = event->stream;
 	ssn = event->ssn;
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	/* Is this the expected SSN for this stream ID?  */
-	if (ssn != sctp_ssn_peek(in, sid)) {
+	if (ssn != sctp_ssn_peek(stream, in, sid)) {
 		/* We've received something out of order, so find where it
 		 * needs to be placed.  We order by stream and then by SSN.
 		 */
@@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	}
 
 	/* Mark that the next chunk has been found.  */
-	sctp_ssn_next(in, sid);
+	sctp_ssn_next(stream, in, sid);
 
 	/* Go find any other chunks that were waiting for
 	 * ordering.
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
 	struct sctp_ulpevent *event;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 	struct sk_buff_head temp;
 	struct sk_buff_head *lobby = &ulpq->lobby;
 	__u16 csid, cssn;
 
-	in  = &ulpq->asoc->ssnmap->in;
+	stream = ulpq->asoc->stream;
 
 	/* We are holding the chunks by stream, by SSN.  */
 	skb_queue_head_init(&temp);
@@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 			continue;
 
 		/* see if this ssn has been marked by skipping */
-		if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
+		if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
 			break;
 
 		__skb_unlink(pos, lobby);
@@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 		csid = cevent->stream;
 		cssn = cevent->ssn;
 
-		if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
-			sctp_ssn_next(in, csid);
+		if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
+			sctp_ssn_next(stream, in, csid);
 			__skb_unlink(pos, lobby);
 			__skb_queue_tail(&temp, pos);
 			event = sctp_skb2event(pos);
@@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
  */
 void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
 {
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 
 	/* Note: The stream ID must be verified before this routine.  */
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	/* Is this an old SSN?  If so ignore. */
-	if (SSN_lt(ssn, sctp_ssn_peek(in, sid)))
+	if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
 		return;
 
 	/* Mark that we are no longer expecting this SSN or lower. */
-	sctp_ssn_skip(in, sid, ssn);
+	sctp_ssn_skip(stream, in, sid, ssn);
 
 	/* Go find any other chunks that were waiting for
 	 * ordering and deliver them if needed.
-- 
cgit v1.2.3


From 780e982905bef61d13496d9af5310bf4af3a64d3 Mon Sep 17 00:00:00 2001
From: santosh.shilimkar@oracle.com
Date: Fri, 6 Jan 2017 10:44:15 -0800
Subject: RDS: validate the requested traces user input against max supported

Larger than supported value can lead to array read/write overflow.

Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/af_rds.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index fd8217404162..b405f77d664c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -310,6 +310,9 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
 	if (copy_from_user(&trace, optval, sizeof(trace)))
 		return -EFAULT;
 
+	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
+		return -EFAULT;
+
 	rs->rs_rx_traces = trace.rx_traces;
 	for (i = 0; i < rs->rs_rx_traces; i++) {
 		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
-- 
cgit v1.2.3


From 8cf2f704534147ce84c4df951887c1d49fe9350c Mon Sep 17 00:00:00 2001
From: Guillaume Nault
Date: Fri, 6 Jan 2017 20:03:54 +0100
Subject: l2tp: remove redundant addr_len check in l2tp_ip_bind()

addr_len's value has already been verified at this point.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 3d73278b86ca..992761e721af 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -258,7 +258,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (!sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
-	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+	if (sk->sk_state != TCP_CLOSE)
 		goto out;
 
 	chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
-- 
cgit v1.2.3


From bb39b0bdc8c62e97ceedb9a5dadea0f098431d8b Mon Sep 17 00:00:00 2001
From: Guillaume Nault
Date: Fri, 6 Jan 2017 20:03:55 +0100
Subject: l2tp: make __l2tp_ip*_bind_lookup() parameters 'const'

Add const qualifier wherever possible for __l2tp_ip_bind_lookup() and
__l2tp_ip6_bind_lookup().

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c  | 4 ++--
 net/l2tp/l2tp_ip6.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 992761e721af..e5686bf898f7 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -53,8 +53,8 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 	struct sock *sk;
 
 	sk_for_each_bound(sk, &l2tp_ip_bind_table) {
-		struct inet_sock *inet = inet_sk(sk);
-		struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 
 		if (l2tp == NULL)
 			continue;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 331ccf5a7bad..2f6be6ddc8cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,8 +57,8 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
 	return (struct l2tp_ip6_sock *)sk;
 }
 
-static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
-					   struct in6_addr *laddr,
+static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
+					   const struct in6_addr *laddr,
 					   const struct in6_addr *raddr,
 					   int dif, u32 tunnel_id)
 {
@@ -67,7 +67,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
 	sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
 		const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
-		struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
 		if (l2tp == NULL)
 			continue;
-- 
cgit v1.2.3


From 986f7cbc510e29c33b7c8c1701a902a752159425 Mon Sep 17 00:00:00 2001
From: Guillaume Nault
Date: Fri, 6 Jan 2017 20:03:56 +0100
Subject: l2tp: remove useless NULL check in __l2tp_ip*_bind_lookup()

If "l2tp" was NULL, that'd mean "sk" is NULL too. This can't happen
since "sk" is returned by sk_for_each_bound().

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c  | 3 ---
 net/l2tp/l2tp_ip6.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index e5686bf898f7..499d3cdbfbc8 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -56,9 +56,6 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
 		const struct inet_sock *inet = inet_sk(sk);
 
-		if (l2tp == NULL)
-			continue;
-
 		if ((l2tp->conn_id == tunnel_id) &&
 		    net_eq(sock_net(sk), net) &&
 		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 2f6be6ddc8cb..7165b06d7b25 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -69,9 +69,6 @@ static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
 		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
-		if (l2tp == NULL)
-			continue;
-
 		if ((l2tp->conn_id == tunnel_id) &&
 		    net_eq(sock_net(sk), net) &&
 		    (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) &&
-- 
cgit v1.2.3


From c5fdae044030f753c9e6a3efa8db28fe00f8fdad Mon Sep 17 00:00:00 2001
From: Guillaume Nault
Date: Fri, 6 Jan 2017 20:03:57 +0100
Subject: l2tp: rework socket comparison in __l2tp_ip*_bind_lookup()

Split conditions, so that each test becomes clearer.

Also, for l2tp_ip, check if "laddr" is 0. This prevents a socket from
binding to the unspecified address when other sockets are already bound
using the same device (if any), connection ID and namespace.

Same thing for l2tp_ip6: add ipv6_addr_any(laddr) and
ipv6_addr_any(raddr) tests to ensure that an IPv6 unspecified address
passed as parameter is properly treated a wildcard.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c  | 24 +++++++++++++++++-------
 net/l2tp/l2tp_ip6.c | 25 ++++++++++++++++++-------
 2 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 499d3cdbfbc8..b07a859f21bd 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -56,13 +56,23 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
 		const struct inet_sock *inet = inet_sk(sk);
 
-		if ((l2tp->conn_id == tunnel_id) &&
-		    net_eq(sock_net(sk), net) &&
-		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-		    (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) &&
-		    (!sk->sk_bound_dev_if || !dif ||
-		     sk->sk_bound_dev_if == dif))
-			goto found;
+		if (!net_eq(sock_net(sk), net))
+			continue;
+
+		if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+			continue;
+
+		if (inet->inet_rcv_saddr && laddr &&
+		    inet->inet_rcv_saddr != laddr)
+			continue;
+
+		if (inet->inet_daddr && raddr && inet->inet_daddr != raddr)
+			continue;
+
+		if (l2tp->conn_id != tunnel_id)
+			continue;
+
+		goto found;
 	}
 
 	sk = NULL;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 7165b06d7b25..4b06eb415f68 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -69,13 +69,24 @@ static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
 		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
-		if ((l2tp->conn_id == tunnel_id) &&
-		    net_eq(sock_net(sk), net) &&
-		    (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) &&
-		    (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) &&
-		    (!sk->sk_bound_dev_if || !dif ||
-		     sk->sk_bound_dev_if == dif))
-			goto found;
+		if (!net_eq(sock_net(sk), net))
+			continue;
+
+		if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+			continue;
+
+		if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
+		    !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
+			continue;
+
+		if (!ipv6_addr_any(sk_raddr) && raddr &&
+		    !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
+			continue;
+
+		if (l2tp->conn_id != tunnel_id)
+			continue;
+
+		goto found;
 	}
 
 	sk = NULL;
-- 
cgit v1.2.3


From 111427f6eb5a5d9ce22f8a90780ac1c18113091a Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 6 Jan 2017 16:42:00 -0500
Subject: net: dsa: move HWMON support to its own file

Isolate the HWMON support in DSA in its own file. Currently only the
legacy DSA code is concerned.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Makefile   |   1 +
 net/dsa/dsa.c      | 131 +----------------------------------------------
 net/dsa/dsa_priv.h |   9 ++++
 net/dsa/hwmon.c    | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+), 129 deletions(-)
 create mode 100644 net/dsa/hwmon.c

(limited to 'net')

diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..560b6747c276 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,7 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += dsa.o slave.o dsa2.o
+dsa_core-$(CONFIG_NET_DSA_HWMON) += hwmon.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 3f85be0aae34..cda787ebad15 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
  * (at your option) any later version.
  */
 
-#include <linux/ctype.h>
 #include <linux/device.h>
-#include <linux/hwmon.h>
 #include <linux/list.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -108,105 +106,6 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 	return ret;
 }
 
-/* hwmon support ************************************************************/
-
-#ifdef CONFIG_NET_DSA_HWMON
-
-static ssize_t temp1_input_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-static DEVICE_ATTR_RO(temp1_input);
-
-static ssize_t temp1_max_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp_limit(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-
-static ssize_t temp1_max_store(struct device *dev,
-			       struct device_attribute *attr, const char *buf,
-			       size_t count)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = kstrtoint(buf, 0, &temp);
-	if (ret < 0)
-		return ret;
-
-	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-static DEVICE_ATTR_RW(temp1_max);
-
-static ssize_t temp1_max_alarm_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	bool alarm;
-	int ret;
-
-	ret = ds->ops->get_temp_alarm(ds, &alarm);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", alarm);
-}
-static DEVICE_ATTR_RO(temp1_max_alarm);
-
-static struct attribute *dsa_hwmon_attrs[] = {
-	&dev_attr_temp1_input.attr,	/* 0 */
-	&dev_attr_temp1_max.attr,	/* 1 */
-	&dev_attr_temp1_max_alarm.attr,	/* 2 */
-	NULL
-};
-
-static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
-				       struct attribute *attr, int index)
-{
-	struct device *dev = container_of(kobj, struct device, kobj);
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	struct dsa_switch_ops *ops = ds->ops;
-	umode_t mode = attr->mode;
-
-	if (index == 1) {
-		if (!ops->get_temp_limit)
-			mode = 0;
-		else if (!ops->set_temp_limit)
-			mode &= ~S_IWUSR;
-	} else if (index == 2 && !ops->get_temp_alarm) {
-		mode = 0;
-	}
-	return mode;
-}
-
-static const struct attribute_group dsa_hwmon_group = {
-	.attrs = dsa_hwmon_attrs,
-	.is_visible = dsa_hwmon_attrs_visible,
-};
-__ATTRIBUTE_GROUPS(dsa_hwmon);
-
-#endif /* CONFIG_NET_DSA_HWMON */
-
 /* basic switch operations **************************************************/
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 		      struct device_node *port_dn, int port)
@@ -415,30 +314,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	/* If the switch provides a temperature sensor,
-	 * register with hardware monitoring subsystem.
-	 * Treat registration error as non-fatal and ignore it.
-	 */
-	if (ops->get_temp) {
-		const char *netname = netdev_name(dst->master_netdev);
-		char hname[IFNAMSIZ + 1];
-		int i, j;
-
-		/* Create valid hwmon 'name' attribute */
-		for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
-			if (isalnum(netname[i]))
-				hname[j++] = netname[i];
-		}
-		hname[j] = '\0';
-		scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
-			  hname, index);
-		ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
-					ds->hwmon_name, ds, dsa_hwmon_groups);
-		if (IS_ERR(ds->hwmon_dev))
-			ds->hwmon_dev = NULL;
-	}
-#endif /* CONFIG_NET_DSA_HWMON */
+	dsa_hwmon_register(ds);
 
 	return 0;
 }
@@ -498,10 +374,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 {
 	int port;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	if (ds->hwmon_dev)
-		hwmon_device_unregister(ds->hwmon_dev);
-#endif
+	dsa_hwmon_unregister(ds);
 
 	/* Destroy network devices for physical switch ports. */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 63ae1484abae..7e3385ec73f4 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -56,6 +56,15 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
 void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
 
+/* hwmon.c */
+#ifdef CONFIG_NET_DSA_HWMON
+void dsa_hwmon_register(struct dsa_switch *ds);
+void dsa_hwmon_unregister(struct dsa_switch *ds);
+#else
+static inline void dsa_hwmon_register(struct dsa_switch *ds) { }
+static inline void dsa_hwmon_unregister(struct dsa_switch *ds) { }
+#endif
+
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/hwmon.c b/net/dsa/hwmon.c
new file mode 100644
index 000000000000..3a9cdf0b22b8
--- /dev/null
+++ b/net/dsa/hwmon.c
@@ -0,0 +1,147 @@
+/*
+ * net/dsa/hwmon.c - HWMON subsystem support
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/hwmon.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+static ssize_t temp1_input_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = ds->ops->get_temp(ds, &temp);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", temp * 1000);
+}
+static DEVICE_ATTR_RO(temp1_input);
+
+static ssize_t temp1_max_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = ds->ops->get_temp_limit(ds, &temp);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", temp * 1000);
+}
+
+static ssize_t temp1_max_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = kstrtoint(buf, 0, &temp);
+	if (ret < 0)
+		return ret;
+
+	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+static DEVICE_ATTR_RW(temp1_max);
+
+static ssize_t temp1_max_alarm_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	bool alarm;
+	int ret;
+
+	ret = ds->ops->get_temp_alarm(ds, &alarm);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", alarm);
+}
+static DEVICE_ATTR_RO(temp1_max_alarm);
+
+static struct attribute *dsa_hwmon_attrs[] = {
+	&dev_attr_temp1_input.attr,	/* 0 */
+	&dev_attr_temp1_max.attr,	/* 1 */
+	&dev_attr_temp1_max_alarm.attr,	/* 2 */
+	NULL
+};
+
+static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
+				       struct attribute *attr, int index)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	struct dsa_switch_ops *ops = ds->ops;
+	umode_t mode = attr->mode;
+
+	if (index == 1) {
+		if (!ops->get_temp_limit)
+			mode = 0;
+		else if (!ops->set_temp_limit)
+			mode &= ~S_IWUSR;
+	} else if (index == 2 && !ops->get_temp_alarm) {
+		mode = 0;
+	}
+	return mode;
+}
+
+static const struct attribute_group dsa_hwmon_group = {
+	.attrs = dsa_hwmon_attrs,
+	.is_visible = dsa_hwmon_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(dsa_hwmon);
+
+void dsa_hwmon_register(struct dsa_switch *ds)
+{
+	const char *netname = netdev_name(ds->dst->master_netdev);
+	char hname[IFNAMSIZ + 1];
+	int i, j;
+
+	/* If the switch provides temperature accessors, register with hardware
+	 * monitoring subsystem. Treat registration error as non-fatal.
+	 */
+	if (!ds->ops->get_temp)
+		return;
+
+	/* Create valid hwmon 'name' attribute */
+	for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
+		if (isalnum(netname[i]))
+			hname[j++] = netname[i];
+	}
+	hname[j] = '\0';
+	scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", hname,
+		  ds->index);
+	ds->hwmon_dev = hwmon_device_register_with_groups(NULL, ds->hwmon_name,
+							  ds, dsa_hwmon_groups);
+	if (IS_ERR(ds->hwmon_dev)) {
+		pr_warn("DSA: failed to register HWMON subsystem for switch %d\n",
+			ds->index);
+		ds->hwmon_dev = NULL;
+	} else {
+		pr_info("DSA: registered HWMON subsystem for switch %d\n",
+			ds->index);
+	}
+}
+
+void dsa_hwmon_unregister(struct dsa_switch *ds)
+{
+	if (ds->hwmon_dev) {
+		hwmon_device_unregister(ds->hwmon_dev);
+		ds->hwmon_dev = NULL;
+	}
+}
-- 
cgit v1.2.3


From 9f09eaeae28a0c67b8fc2b5acb411a5d4fd8cc80 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Fri, 6 Jan 2017 17:39:06 -0800
Subject: net: ipmr: Remove nowait arg to ipmr_get_route

ipmr_get_route has 1 caller and the nowait arg is 0. Remove the arg and
simplify ipmr_get_route accordingly.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 2 +-
 net/ipv4/ipmr.c        | 7 +------
 net/ipv4/route.c       | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index e5fb81376e92..f019b62f27b5 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -120,5 +120,5 @@ struct mfc_cache {
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-		   struct rtmsg *rtm, int nowait, u32 portid);
+		   struct rtmsg *rtm, u32 portid);
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b35dda57586b..824c4fdf21eb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2136,7 +2136,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-		   struct rtmsg *rtm, int nowait, u32 portid)
+		   struct rtmsg *rtm, u32 portid)
 {
 	struct mfc_cache *cache;
 	struct mr_table *mrt;
@@ -2160,11 +2160,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		struct net_device *dev;
 		int vif = -1;
 
-		if (nowait) {
-			rcu_read_unlock();
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		read_lock(&mrt_lock);
 		if (dev)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7b52ac20145b..0965f8c59e7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2541,7 +2541,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
 						 fl4->saddr, fl4->daddr,
-						 r, 0, portid);
+						 r, portid);
 
 			if (err <= 0) {
 				if (err == 0)
-- 
cgit v1.2.3


From dc33da59ff887985db559d0b116ca8b98f63651e Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Fri, 6 Jan 2017 17:39:58 -0800
Subject: net: ipv4: Remove flow arg from ip_mkroute_input

fl4 arg is not used; remove it.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0965f8c59e7e..f51823dc998b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1758,7 +1758,6 @@ standard_hash:
 
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
-			    const struct flowi4 *fl4,
 			    struct in_device *in_dev,
 			    __be32 daddr, __be32 saddr, u32 tos)
 {
@@ -1883,7 +1882,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type != RTN_UNICAST)
 		goto martian_destination;
 
-	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+	err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
 out:	return err;
 
 brd_input:
-- 
cgit v1.2.3


From bc1f44709cf27fb2a5766cadafe7e2ad5e9cb221 Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Fri, 6 Jan 2017 19:12:52 -0800
Subject: net: make ndo_get_stats64 a void function

The network device operation for reading statistics is only called
in one place, and it ignores the return value. Having a structure
return value is potentially confusing because some future driver could
incorrectly assume that the return value was used.

Fix all drivers with ndo_get_stats64 to have a void function.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                      | 10 ++++------
 drivers/net/dummy.c                                  |  5 ++---
 drivers/net/ethernet/alacritech/slicoss.c            |  6 ++----
 drivers/net/ethernet/amazon/ena/ena_netdev.c         | 10 ++++------
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c             |  6 ++----
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c     |  4 +---
 drivers/net/ethernet/atheros/alx/main.c              |  6 ++----
 drivers/net/ethernet/broadcom/b44.c                  |  5 ++---
 drivers/net/ethernet/broadcom/bnx2.c                 |  5 ++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c            |  6 ++----
 drivers/net/ethernet/broadcom/tg3.c                  |  8 +++-----
 drivers/net/ethernet/brocade/bna/bnad.c              |  6 ++----
 drivers/net/ethernet/calxeda/xgmac.c                 |  5 ++---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c     |  5 ++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c      |  7 +++----
 drivers/net/ethernet/cisco/enic/enic_main.c          |  8 +++-----
 drivers/net/ethernet/ec_bhf.c                        |  4 +---
 drivers/net/ethernet/emulex/benet/be_main.c          |  5 ++---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c       |  6 ++----
 drivers/net/ethernet/hisilicon/hns/hns_enet.c        |  6 ++----
 drivers/net/ethernet/ibm/ehea/ehea_main.c            |  5 ++---
 drivers/net/ethernet/intel/e1000e/e1000.h            |  4 ++--
 drivers/net/ethernet/intel/e1000e/netdev.c           |  5 ++---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c      |  6 ++----
 drivers/net/ethernet/intel/i40e/i40e.h               |  5 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c          | 18 ++++++------------
 drivers/net/ethernet/intel/igb/igb_main.c            | 10 ++++------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c        |  7 ++++---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c    |  6 ++----
 drivers/net/ethernet/marvell/mvneta.c                |  4 +---
 drivers/net/ethernet/marvell/mvpp2.c                 |  4 +---
 drivers/net/ethernet/marvell/sky2.c                  |  6 ++----
 drivers/net/ethernet/mediatek/mtk_eth_soc.c          |  6 ++----
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c       |  4 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c    |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c     |  3 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c       |  4 +---
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c       |  3 +--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c     |  9 ++++-----
 drivers/net/ethernet/neterion/vxge/vxge-main.c       |  4 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  |  6 ++----
 drivers/net/ethernet/nvidia/forcedeth.c              |  4 +---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 10 ++++------
 drivers/net/ethernet/qlogic/qede/qede_main.c         |  7 ++-----
 drivers/net/ethernet/qualcomm/emac/emac.c            |  6 ++----
 drivers/net/ethernet/realtek/8139too.c               |  9 +++------
 drivers/net/ethernet/realtek/r8169.c                 |  4 +---
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c      |  8 ++------
 drivers/net/ethernet/sfc/efx.c                       |  6 ++----
 drivers/net/ethernet/sfc/falcon/efx.c                |  6 ++----
 drivers/net/ethernet/sun/niu.c                       |  6 ++----
 drivers/net/ethernet/synopsys/dwc_eth_qos.c          |  4 +---
 drivers/net/ethernet/tile/tilepro.c                  |  4 ++--
 drivers/net/ethernet/via/via-rhine.c                 |  8 +++-----
 drivers/net/fjes/fjes_main.c                         |  7 ++-----
 drivers/net/hyperv/netvsc_drv.c                      |  6 ++----
 drivers/net/ifb.c                                    |  6 ++----
 drivers/net/ipvlan/ipvlan_main.c                     |  5 ++---
 drivers/net/loopback.c                               |  5 ++---
 drivers/net/macsec.c                                 |  8 +++-----
 drivers/net/macvlan.c                                |  5 ++---
 drivers/net/nlmon.c                                  |  4 +---
 drivers/net/ppp/ppp_generic.c                        |  4 +---
 drivers/net/slip/slip.c                              |  3 +--
 drivers/net/team/team.c                              |  3 +--
 drivers/net/tun.c                                    |  3 +--
 drivers/net/veth.c                                   |  6 ++----
 drivers/net/virtio_net.c                             |  6 ++----
 drivers/net/vmxnet3/vmxnet3_ethtool.c                |  4 +---
 drivers/net/vmxnet3/vmxnet3_int.h                    |  4 ++--
 drivers/net/vrf.c                                    |  5 ++---
 drivers/net/xen-netfront.c                           |  6 ++----
 drivers/staging/netlogic/xlr_net.c                   | 10 +---------
 include/linux/netdevice.h                            |  8 ++++----
 include/net/ip_tunnels.h                             |  4 ++--
 net/8021q/vlan_dev.c                                 |  5 ++---
 net/bridge/br_device.c                               |  6 ++----
 net/ipv4/ip_tunnel_core.c                            |  6 ++----
 net/l2tp/l2tp_eth.c                                  |  6 ++----
 net/mac80211/iface.c                                 |  4 +---
 net/openvswitch/vport-internal_dev.c                 |  4 +---
 net/sched/sch_teql.c                                 |  5 ++---
 82 files changed, 166 insertions(+), 309 deletions(-)

(limited to 'net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8029dd4912b6..36919221b3f0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -211,8 +211,8 @@ static int lacp_fast;
 
 static int bond_init(struct net_device *bond_dev);
 static void bond_uninit(struct net_device *bond_dev);
-static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
-						struct rtnl_link_stats64 *stats);
+static void bond_get_stats(struct net_device *bond_dev,
+			   struct rtnl_link_stats64 *stats);
 static void bond_slave_arr_handler(struct work_struct *work);
 static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
 				  int mod);
@@ -3337,8 +3337,8 @@ static void bond_fold_stats(struct rtnl_link_stats64 *_res,
 	}
 }
 
-static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
-						struct rtnl_link_stats64 *stats)
+static void bond_get_stats(struct net_device *bond_dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct rtnl_link_stats64 temp;
@@ -3362,8 +3362,6 @@ static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
 
 	memcpy(&bond->bond_stats, stats, sizeof(*stats));
 	spin_unlock(&bond->stats_lock);
-
-	return stats;
 }
 
 static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 6421835f11b7..1f2de4e8207c 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -54,8 +54,8 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-static struct rtnl_link_stats64 *dummy_get_stats64(struct net_device *dev,
-						   struct rtnl_link_stats64 *stats)
+static void dummy_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
 {
 	int i;
 
@@ -73,7 +73,6 @@ static struct rtnl_link_stats64 *dummy_get_stats64(struct net_device *dev,
 		stats->tx_bytes += tbytes;
 		stats->tx_packets += tpackets;
 	}
-	return stats;
 }
 
 static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/drivers/net/ethernet/alacritech/slicoss.c b/drivers/net/ethernet/alacritech/slicoss.c
index b21d8aa8d653..15a8096c60df 100644
--- a/drivers/net/ethernet/alacritech/slicoss.c
+++ b/drivers/net/ethernet/alacritech/slicoss.c
@@ -1471,8 +1471,8 @@ drop_skb:
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *slic_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *lst)
+static void slic_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *lst)
 {
 	struct slic_device *sdev = netdev_priv(dev);
 	struct slic_stats *stats = &sdev->stats;
@@ -1489,8 +1489,6 @@ static struct rtnl_link_stats64 *slic_get_stats(struct net_device *dev,
 	SLIC_GET_STATS_COUNTER(lst->rx_crc_errors, stats, rx_crc);
 	SLIC_GET_STATS_COUNTER(lst->rx_fifo_errors, stats, rx_oflow802);
 	SLIC_GET_STATS_COUNTER(lst->tx_carrier_errors, stats, tx_carrier);
-
-	return lst;
 }
 
 static int slic_get_sset_count(struct net_device *dev, int sset)
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index cc8b13ebfa75..aca95b397393 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2165,19 +2165,19 @@ err:
 	ena_com_delete_debug_area(adapter->ena_dev);
 }
 
-static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev,
-						 struct rtnl_link_stats64 *stats)
+static void ena_get_stats64(struct net_device *netdev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct ena_adapter *adapter = netdev_priv(netdev);
 	struct ena_admin_basic_stats ena_stats;
 	int rc;
 
 	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
-		return NULL;
+		return;
 
 	rc = ena_com_get_dev_basic_stats(adapter->ena_dev, &ena_stats);
 	if (rc)
-		return NULL;
+		return;
 
 	stats->tx_bytes = ((u64)ena_stats.tx_bytes_high << 32) |
 		ena_stats.tx_bytes_low;
@@ -2204,8 +2204,6 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev,
 
 	stats->rx_errors = 0;
 	stats->tx_errors = 0;
-
-	return stats;
 }
 
 static const struct net_device_ops ena_netdev_ops = {
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 155190db682d..130de11fa553 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1759,8 +1759,8 @@ static void xgbe_tx_timeout(struct net_device *netdev)
 	schedule_work(&pdata->restart_work);
 }
 
-static struct rtnl_link_stats64 *xgbe_get_stats64(struct net_device *netdev,
-						  struct rtnl_link_stats64 *s)
+static void xgbe_get_stats64(struct net_device *netdev,
+			     struct rtnl_link_stats64 *s)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	struct xgbe_mmc_stats *pstats = &pdata->mmc_stats;
@@ -1786,8 +1786,6 @@ static struct rtnl_link_stats64 *xgbe_get_stats64(struct net_device *netdev,
 	s->tx_dropped = netdev->stats.tx_dropped;
 
 	DBGPR("<--%s\n", __func__);
-
-	return s;
 }
 
 static int xgbe_vlan_rx_add_vid(struct net_device *netdev, __be16 proto,
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 523b8eff6d7b..45f2204e6695 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -1453,7 +1453,7 @@ err:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *xgene_enet_get_stats64(
+static void xgene_enet_get_stats64(
 			struct net_device *ndev,
 			struct rtnl_link_stats64 *storage)
 {
@@ -1484,8 +1484,6 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64(
 		}
 	}
 	memcpy(storage, stats, sizeof(struct rtnl_link_stats64));
-
-	return storage;
 }
 
 static int xgene_enet_set_mac_address(struct net_device *ndev, void *addr)
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index c8f525574d68..c66195d00ed4 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1643,8 +1643,8 @@ static void alx_poll_controller(struct net_device *netdev)
 }
 #endif
 
-static struct rtnl_link_stats64 *alx_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *net_stats)
+static void alx_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *net_stats)
 {
 	struct alx_priv *alx = netdev_priv(dev);
 	struct alx_hw_stats *hw_stats = &alx->hw.stats;
@@ -1688,8 +1688,6 @@ static struct rtnl_link_stats64 *alx_get_stats64(struct net_device *dev,
 	net_stats->rx_packets = hw_stats->rx_ok + net_stats->rx_errors;
 
 	spin_unlock(&alx->stats_lock);
-
-	return net_stats;
 }
 
 static const struct net_device_ops alx_netdev_ops = {
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 48707ed76ffc..7aef70f7d8ef 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -1674,8 +1674,8 @@ static int b44_close(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *b44_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *nstat)
+static void b44_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *nstat)
 {
 	struct b44 *bp = netdev_priv(dev);
 	struct b44_hw_stats *hwstat = &bp->hw_stats;
@@ -1718,7 +1718,6 @@ static struct rtnl_link_stats64 *b44_get_stats64(struct net_device *dev,
 #endif
 	} while (u64_stats_fetch_retry_irq(&hwstat->syncp, start));
 
-	return nstat;
 }
 
 static int __b44_load_mcast(struct b44 *bp, struct net_device *dev)
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index d5d1026be4b7..de1d07c08495 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6821,13 +6821,13 @@ bnx2_save_stats(struct bnx2 *bp)
 	(unsigned long) (bp->stats_blk->ctr +			\
 			 bp->temp_stats_blk->ctr)
 
-static struct rtnl_link_stats64 *
+static void
 bnx2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
 	if (bp->stats_blk == NULL)
-		return net_stats;
+		return;
 
 	net_stats->rx_packets =
 		GET_64BIT_NET_STATS(stat_IfHCInUcastPkts) +
@@ -6891,7 +6891,6 @@ bnx2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 		GET_32BIT_NET_STATS(stat_IfInMBUFDiscards) +
 		GET_32BIT_NET_STATS(stat_FwRxDrop);
 
-	return net_stats;
 }
 
 /* All ethtool functions called with rtnl_lock */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 98e948489700..e5f458396e1a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5879,7 +5879,7 @@ static int bnxt_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static struct rtnl_link_stats64 *
+static void
 bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	u32 i;
@@ -5888,7 +5888,7 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	memset(stats, 0, sizeof(struct rtnl_link_stats64));
 
 	if (!bp->bnapi)
-		return stats;
+		return;
 
 	/* TODO check if we need to synchronize with bnxt_close path */
 	for (i = 0; i < bp->cp_nr_rings; i++) {
@@ -5935,8 +5935,6 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_fifo_errors = le64_to_cpu(tx->tx_fifo_underruns);
 		stats->tx_errors = le64_to_cpu(tx->tx_err);
 	}
-
-	return stats;
 }
 
 static bool bnxt_mc_list_updated(struct bnxt *bp, u32 *rx_mask)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 185e9e047aa9..800328f562fa 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14142,8 +14142,8 @@ static const struct ethtool_ops tg3_ethtool_ops = {
 	.set_link_ksettings	= tg3_set_link_ksettings,
 };
 
-static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void tg3_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
@@ -14151,13 +14151,11 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
 	if (!tp->hw_stats) {
 		*stats = tp->net_stats_prev;
 		spin_unlock_bh(&tp->lock);
-		return stats;
+		return;
 	}
 
 	tg3_get_nstats(tp, stats);
 	spin_unlock_bh(&tp->lock);
-
-	return stats;
 }
 
 static void tg3_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index 112030828c4b..73a94113db1f 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -3111,7 +3111,7 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev)
  * Used spin_lock to synchronize reading of stats structures, which
  * is written by BNA under the same lock.
  */
-static struct rtnl_link_stats64 *
+static void
 bnad_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 {
 	struct bnad *bnad = netdev_priv(netdev);
@@ -3123,8 +3123,6 @@ bnad_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 	bnad_netdev_hwstats_fill(bnad, stats);
 
 	spin_unlock_irqrestore(&bnad->bna_lock, flags);
-
-	return stats;
 }
 
 static void
@@ -3427,7 +3425,7 @@ static const struct net_device_ops bnad_netdev_ops = {
 	.ndo_open		= bnad_open,
 	.ndo_stop		= bnad_stop,
 	.ndo_start_xmit		= bnad_start_xmit,
-	.ndo_get_stats64		= bnad_get_stats64,
+	.ndo_get_stats64	= bnad_get_stats64,
 	.ndo_set_rx_mode	= bnad_set_rx_mode,
 	.ndo_validate_addr      = eth_validate_addr,
 	.ndo_set_mac_address    = bnad_set_mac_address,
diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c
index ce7de6f72512..b0540658afad 100644
--- a/drivers/net/ethernet/calxeda/xgmac.c
+++ b/drivers/net/ethernet/calxeda/xgmac.c
@@ -1446,9 +1446,9 @@ static void xgmac_poll_controller(struct net_device *dev)
 }
 #endif
 
-static struct rtnl_link_stats64 *
+static void
 xgmac_get_stats64(struct net_device *dev,
-		       struct rtnl_link_stats64 *storage)
+		  struct rtnl_link_stats64 *storage)
 {
 	struct xgmac_priv *priv = netdev_priv(dev);
 	void __iomem *base = priv->base;
@@ -1476,7 +1476,6 @@ xgmac_get_stats64(struct net_device *dev,
 
 	writel(0, base + XGMAC_MMC_CTRL);
 	spin_unlock_bh(&priv->stats_lock);
-	return storage;
 }
 
 static int xgmac_set_mac_address(struct net_device *dev, void *p)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 2006f58b14b1..273eafdb1c57 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1461,8 +1461,8 @@ void nicvf_update_stats(struct nicvf *nic)
 		nicvf_update_sq_stats(nic, qidx);
 }
 
-static struct rtnl_link_stats64 *nicvf_get_stats64(struct net_device *netdev,
-					    struct rtnl_link_stats64 *stats)
+static void nicvf_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct nicvf *nic = netdev_priv(netdev);
 	struct nicvf_hw_stats *hw_stats = &nic->hw_stats;
@@ -1478,7 +1478,6 @@ static struct rtnl_link_stats64 *nicvf_get_stats64(struct net_device *netdev,
 	stats->tx_packets = hw_stats->tx_frames;
 	stats->tx_dropped = hw_stats->tx_drops;
 
-	return stats;
 }
 
 static void nicvf_tx_timeout(struct net_device *dev)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 629b11879ceb..3349e1f376c3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2375,8 +2375,8 @@ int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
 }
 EXPORT_SYMBOL(cxgb4_remove_server_filter);
 
-static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *ns)
+static void cxgb_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *ns)
 {
 	struct port_stats stats;
 	struct port_info *p = netdev_priv(dev);
@@ -2389,7 +2389,7 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 	spin_lock(&adapter->stats_lock);
 	if (!netif_device_present(dev)) {
 		spin_unlock(&adapter->stats_lock);
-		return ns;
+		return;
 	}
 	t4_get_port_stats_offset(adapter, p->tx_chan, &stats,
 				 &p->stats_base);
@@ -2423,7 +2423,6 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 	ns->tx_errors = stats.tx_error_frames;
 	ns->rx_errors = stats.rx_symbol_err + stats.rx_fcs_err +
 		ns->rx_length_errors + stats.rx_len_err + ns->rx_fifo_errors;
-	return ns;
 }
 
 static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index cdd7a1a59aa7..c5842c525eed 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -680,8 +680,8 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
 }
 
 /* dev_base_lock rwlock held, nominally process context */
-static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
-						struct rtnl_link_stats64 *net_stats)
+static void enic_get_stats(struct net_device *netdev,
+			   struct rtnl_link_stats64 *net_stats)
 {
 	struct enic *enic = netdev_priv(netdev);
 	struct vnic_stats *stats;
@@ -693,7 +693,7 @@ static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
 	 * recorded stats.
 	 */
 	if (err == -ENOMEM)
-		return net_stats;
+		return;
 
 	net_stats->tx_packets = stats->tx.tx_frames_ok;
 	net_stats->tx_bytes = stats->tx.tx_bytes_ok;
@@ -707,8 +707,6 @@ static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
 	net_stats->rx_over_errors = enic->rq_truncated_pkts;
 	net_stats->rx_crc_errors = enic->rq_bad_fcs;
 	net_stats->rx_dropped = stats->rx.rx_no_bufs + stats->rx.rx_drop;
-
-	return net_stats;
 }
 
 static int enic_mc_sync(struct net_device *netdev, const u8 *mc_addr)
diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c
index 7bf78a0d322c..278f139f2a22 100644
--- a/drivers/net/ethernet/ec_bhf.c
+++ b/drivers/net/ethernet/ec_bhf.c
@@ -457,7 +457,7 @@ static int ec_bhf_stop(struct net_device *net_dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *
+static void
 ec_bhf_get_stats(struct net_device *net_dev,
 		 struct rtnl_link_stats64 *stats)
 {
@@ -472,8 +472,6 @@ ec_bhf_get_stats(struct net_device *net_dev,
 
 	stats->tx_bytes = priv->stat_tx_bytes;
 	stats->rx_bytes = priv->stat_rx_bytes;
-
-	return stats;
 }
 
 static const struct net_device_ops ec_bhf_netdev_ops = {
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 225e9a4877d7..0a679d2eaeee 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -639,8 +639,8 @@ void be_parse_stats(struct be_adapter *adapter)
 	}
 }
 
-static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void be_get_stats64(struct net_device *netdev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_drv_stats *drvs = &adapter->drv_stats;
@@ -704,7 +704,6 @@ static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev,
 	stats->rx_fifo_errors = drvs->rxpp_fifo_overflow_drop +
 				drvs->rx_input_fifo_overflow_drop +
 				drvs->rx_drops_no_pbuf;
-	return stats;
 }
 
 void be_link_status_update(struct be_adapter *adapter, u8 link_status)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index c9b7ad65e563..b7cbc26a0911 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -313,8 +313,8 @@ static void dpaa_tx_timeout(struct net_device *net_dev)
 /* Calculates the statistics for the given device by adding the statistics
  * collected by each CPU.
  */
-static struct rtnl_link_stats64 *dpaa_get_stats64(struct net_device *net_dev,
-						  struct rtnl_link_stats64 *s)
+static void dpaa_get_stats64(struct net_device *net_dev,
+			     struct rtnl_link_stats64 *s)
 {
 	int numstats = sizeof(struct rtnl_link_stats64) / sizeof(u64);
 	struct dpaa_priv *priv = netdev_priv(net_dev);
@@ -332,8 +332,6 @@ static struct rtnl_link_stats64 *dpaa_get_stats64(struct net_device *net_dev,
 		for (j = 0; j < numstats; j++)
 			netstats[j] += cpustats[j];
 	}
-
-	return s;
 }
 
 static struct mac_device *dpaa_mac_dev_get(struct platform_device *pdev)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 672b64606321..b7cb61385ad8 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -1625,8 +1625,8 @@ void hns_nic_set_rx_mode(struct net_device *ndev)
 		netdev_err(ndev, "sync uc address fail\n");
 }
 
-struct rtnl_link_stats64 *hns_nic_get_stats64(struct net_device *ndev,
-					      struct rtnl_link_stats64 *stats)
+static void hns_nic_get_stats64(struct net_device *ndev,
+				struct rtnl_link_stats64 *stats)
 {
 	int idx = 0;
 	u64 tx_bytes = 0;
@@ -1668,8 +1668,6 @@ struct rtnl_link_stats64 *hns_nic_get_stats64(struct net_device *ndev,
 	stats->tx_window_errors = ndev->stats.tx_window_errors;
 	stats->rx_compressed = ndev->stats.rx_compressed;
 	stats->tx_compressed = ndev->stats.tx_compressed;
-
-	return stats;
 }
 
 static u16
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 702446a93697..1e53d7a82675 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -328,8 +328,8 @@ out:
 	spin_unlock_irqrestore(&ehea_bcmc_regs.lock, flags);
 }
 
-static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *stats)
+static void ehea_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct ehea_port *port = netdev_priv(dev);
 	u64 rx_packets = 0, tx_packets = 0, rx_bytes = 0, tx_bytes = 0;
@@ -352,7 +352,6 @@ static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
 
 	stats->multicast = port->stats.multicast;
 	stats->rx_errors = port->stats.rx_errors;
-	return stats;
 }
 
 static void ehea_update_stats(struct work_struct *work)
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 879cca47b021..a29b12e80855 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -493,8 +493,8 @@ int e1000e_setup_rx_resources(struct e1000_ring *ring);
 int e1000e_setup_tx_resources(struct e1000_ring *ring);
 void e1000e_free_rx_resources(struct e1000_ring *ring);
 void e1000e_free_tx_resources(struct e1000_ring *ring);
-struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats);
+void e1000e_get_stats64(struct net_device *netdev,
+			struct rtnl_link_stats64 *stats);
 void e1000e_set_interrupt_capability(struct e1000_adapter *adapter);
 void e1000e_reset_interrupt_capability(struct e1000_adapter *adapter);
 void e1000e_get_hw_control(struct e1000_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index af3960853a32..723025b317cc 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5920,8 +5920,8 @@ static void e1000_reset_task(struct work_struct *work)
  *
  * Returns the address of the device statistics structure.
  **/
-struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
+void e1000e_get_stats64(struct net_device *netdev,
+			struct rtnl_link_stats64 *stats)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
@@ -5958,7 +5958,6 @@ struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
 	/* Tx Dropped needs to be maintained elsewhere */
 
 	spin_unlock(&adapter->stats64_lock);
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index bc5ef6eb3dd6..01db688cf539 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1118,8 +1118,8 @@ void fm10k_reset_rx_state(struct fm10k_intfc *interface)
  * Returns 64bit statistics, for use in the ndo_get_stats64 callback. This
  * function replaces fm10k_get_stats for kernels which support it.
  */
-static struct rtnl_link_stats64 *fm10k_get_stats64(struct net_device *netdev,
-						   struct rtnl_link_stats64 *stats)
+static void fm10k_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_ring *ring;
@@ -1164,8 +1164,6 @@ static struct rtnl_link_stats64 *fm10k_get_stats64(struct net_device *netdev,
 
 	/* following stats updated by fm10k_service_task() */
 	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
-
-	return stats;
 }
 
 int fm10k_setup_tc(struct net_device *dev, u8 tc)
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ba8d30984bee..342007df4663 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -834,9 +834,8 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
 void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf);
 void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba);
 #ifdef I40E_FCOE
-struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *storage);
+void i40e_get_netdev_stats_struct(struct net_device *netdev,
+				  struct rtnl_link_stats64 *storage);
 int i40e_set_mac(struct net_device *netdev, void *p);
 void i40e_set_rx_mode(struct net_device *netdev);
 #endif
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad4cf639430e..b2f76d24000d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -409,15 +409,11 @@ struct rtnl_link_stats64 *i40e_get_vsi_stats_struct(struct i40e_vsi *vsi)
  * Returns the address of the device statistics structure.
  * The statistics are actually updated from the service task.
  **/
-#ifdef I40E_FCOE
-struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
-#else
-static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
+#ifndef I40E_FCOE
+static
 #endif
+void i40e_get_netdev_stats_struct(struct net_device *netdev,
+				  struct rtnl_link_stats64 *stats)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_ring *tx_ring, *rx_ring;
@@ -426,10 +422,10 @@ static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
 	int i;
 
 	if (test_bit(__I40E_DOWN, &vsi->state))
-		return stats;
+		return;
 
 	if (!vsi->tx_rings)
-		return stats;
+		return;
 
 	rcu_read_lock();
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
@@ -469,8 +465,6 @@ static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
 	stats->rx_dropped	= vsi_stats->rx_dropped;
 	stats->rx_crc_errors	= vsi_stats->rx_crc_errors;
 	stats->rx_length_errors	= vsi_stats->rx_length_errors;
-
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 594604e09f8d..7546109d4980 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -137,8 +137,8 @@ static void igb_update_phy_info(unsigned long);
 static void igb_watchdog(unsigned long);
 static void igb_watchdog_task(struct work_struct *);
 static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
-static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
-					  struct rtnl_link_stats64 *stats);
+static void igb_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats);
 static int igb_change_mtu(struct net_device *, int);
 static int igb_set_mac(struct net_device *, void *);
 static void igb_set_uta(struct igb_adapter *adapter, bool set);
@@ -5404,8 +5404,8 @@ static void igb_reset_task(struct work_struct *work)
  *  @netdev: network interface device structure
  *  @stats: rtnl_link_stats64 pointer
  **/
-static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void igb_get_stats64(struct net_device *netdev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 
@@ -5413,8 +5413,6 @@ static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *netdev,
 	igb_update_stats(adapter, &adapter->stats64);
 	memcpy(stats, &adapter->stats64, sizeof(*stats));
 	spin_unlock(&adapter->stats64_lock);
-
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0c6eca570791..ffe7d940d9ff 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8173,8 +8173,9 @@ static void ixgbe_netpoll(struct net_device *netdev)
 }
 
 #endif
-static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
-						   struct rtnl_link_stats64 *stats)
+
+static void ixgbe_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	int i;
@@ -8212,13 +8213,13 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
 		}
 	}
 	rcu_read_unlock();
+
 	/* following stats updated by ixgbe_watchdog_task() */
 	stats->multicast	= netdev->stats.multicast;
 	stats->rx_errors	= netdev->stats.rx_errors;
 	stats->rx_length_errors	= netdev->stats.rx_length_errors;
 	stats->rx_crc_errors	= netdev->stats.rx_crc_errors;
 	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
-	return stats;
 }
 
 #ifdef CONFIG_IXGBE_DCB
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 1a28349114f8..b06863560c7d 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3896,8 +3896,8 @@ static void ixgbevf_shutdown(struct pci_dev *pdev)
 	ixgbevf_suspend(pdev, PMSG_SUSPEND);
 }
 
-static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void ixgbevf_get_stats(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	unsigned int start;
@@ -3930,8 +3930,6 @@ static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
 		stats->tx_bytes += bytes;
 		stats->tx_packets += packets;
 	}
-
-	return stats;
 }
 
 #define IXGBEVF_MAX_MAC_HDR_LEN		127
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e05e22705cf7..3607d8febbcf 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -652,7 +652,7 @@ static void mvneta_mib_counters_clear(struct mvneta_port *pp)
 }
 
 /* Get System Network Statistics */
-static struct rtnl_link_stats64 *
+static void
 mvneta_get_stats64(struct net_device *dev,
 		   struct rtnl_link_stats64 *stats)
 {
@@ -686,8 +686,6 @@ mvneta_get_stats64(struct net_device *dev,
 	stats->rx_dropped	= dev->stats.rx_dropped;
 
 	stats->tx_dropped	= dev->stats.tx_dropped;
-
-	return stats;
 }
 
 /* Rx descriptors helper methods */
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 4fe430ceb194..69db40e1a4e1 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -5739,7 +5739,7 @@ error:
 	return err;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
@@ -5771,8 +5771,6 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_errors	= dev->stats.rx_errors;
 	stats->rx_dropped	= dev->stats.rx_dropped;
 	stats->tx_dropped	= dev->stats.tx_dropped;
-
-	return stats;
 }
 
 static int mvpp2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index b60ad0e56a9f..18d6336fa162 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -3888,8 +3888,8 @@ static void sky2_set_multicast(struct net_device *dev)
 	gma_write16(hw, port, GM_RX_CTRL, reg);
 }
 
-static struct rtnl_link_stats64 *sky2_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void sky2_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
@@ -3929,8 +3929,6 @@ static struct rtnl_link_stats64 *sky2_get_stats(struct net_device *dev,
 	stats->rx_dropped = dev->stats.rx_dropped;
 	stats->rx_fifo_errors = dev->stats.rx_fifo_errors;
 	stats->tx_fifo_errors = dev->stats.tx_fifo_errors;
-
-	return stats;
 }
 
 /* Can have one global because blinking is controlled by
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 3dd87889e67e..25ae0c5bce3a 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -462,8 +462,8 @@ static void mtk_stats_update(struct mtk_eth *eth)
 	}
 }
 
-static struct rtnl_link_stats64 *mtk_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *storage)
+static void mtk_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *storage)
 {
 	struct mtk_mac *mac = netdev_priv(dev);
 	struct mtk_hw_stats *hw_stats = mac->hw_stats;
@@ -494,8 +494,6 @@ static struct rtnl_link_stats64 *mtk_get_stats64(struct net_device *dev,
 	storage->tx_errors = dev->stats.tx_errors;
 	storage->rx_dropped = dev->stats.rx_dropped;
 	storage->tx_dropped = dev->stats.tx_dropped;
-
-	return storage;
 }
 
 static inline int mtk_max_frag_size(int mtu)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index edbe200ac2fa..06ef23f040a4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1321,7 +1321,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 }
 
 
-static struct rtnl_link_stats64 *
+static void
 mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -1330,8 +1330,6 @@ mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	mlx4_en_fold_software_stats(dev);
 	netdev_stats_to_stats64(stats, &dev->stats);
 	spin_unlock_bh(&priv->stats_lock);
-
-	return stats;
 }
 
 static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 88dd731bb8cb..60e5670452a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2686,7 +2686,7 @@ mqprio:
 	return mlx5e_setup_tc(dev, tc->tc);
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
@@ -2729,7 +2729,6 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->multicast =
 		VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
 
-	return stats;
 }
 
 static void mlx5e_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 850378893b25..2c864574a9d5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -374,13 +374,12 @@ int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
 	return -EINVAL;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
 	memcpy(stats, &priv->stats.vf_vport, sizeof(*stats));
-	return stats;
 }
 
 static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d768c7b6c6d6..46c53a042e6b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -947,15 +947,13 @@ out:
 /* Return the stats from a cache that is updated periodically,
  * as this function might get called in an atomic context.
  */
-static struct rtnl_link_stats64 *
+static void
 mlxsw_sp_port_get_stats64(struct net_device *dev,
 			  struct rtnl_link_stats64 *stats)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 
 	memcpy(stats, mlxsw_sp_port->hw_stats.cache, sizeof(*stats));
-
-	return stats;
 }
 
 int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index 150ccf5192a9..696d40612d28 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -381,7 +381,7 @@ static int mlxsw_sx_port_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlxsw_sx_port_get_stats64(struct net_device *dev,
 			  struct rtnl_link_stats64 *stats)
 {
@@ -410,7 +410,6 @@ mlxsw_sx_port_get_stats64(struct net_device *dev,
 		tx_dropped	+= p->tx_dropped;
 	}
 	stats->tx_dropped	= tx_dropped;
-	return stats;
 }
 
 static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name,
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index e506ca876d0d..db297cfce6f4 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -378,8 +378,8 @@ static inline void put_be32(__be32 val, __be32 __iomem * p)
 	__raw_writel((__force __u32) val, (__force void __iomem *)p);
 }
 
-static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
-						    struct rtnl_link_stats64 *stats);
+static void myri10ge_get_stats(struct net_device *dev,
+			       struct rtnl_link_stats64 *stats);
 
 static void set_fw_name(struct myri10ge_priv *mgp, char *name, bool allocated)
 {
@@ -3119,8 +3119,8 @@ drop:
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
-						    struct rtnl_link_stats64 *stats)
+static void myri10ge_get_stats(struct net_device *dev,
+			       struct rtnl_link_stats64 *stats)
 {
 	const struct myri10ge_priv *mgp = netdev_priv(dev);
 	const struct myri10ge_slice_netstats *slice_stats;
@@ -3135,7 +3135,6 @@ static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
 		stats->rx_dropped += slice_stats->rx_dropped;
 		stats->tx_dropped += slice_stats->tx_dropped;
 	}
-	return stats;
 }
 
 static void myri10ge_set_multicast_list(struct net_device *dev)
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index e07b936f64ec..f364502229db 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3111,7 +3111,7 @@ static int vxge_change_mtu(struct net_device *dev, int new_mtu)
  * @stats: pointer to struct rtnl_link_stats64
  *
  */
-static struct rtnl_link_stats64 *
+static void
 vxge_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 {
 	struct vxgedev *vdev = netdev_priv(dev);
@@ -3150,8 +3150,6 @@ vxge_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 		net_stats->tx_bytes += bytes;
 		net_stats->tx_errors += txstats->tx_errors;
 	}
-
-	return net_stats;
 }
 
 static enum vxge_hw_status vxge_timestamp_config(struct __vxge_hw_device *devh)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e8d448109e03..67afd95ffb93 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2638,8 +2638,8 @@ static int nfp_net_change_mtu(struct net_device *netdev, int new_mtu)
 	return nfp_net_ring_reconfig(nn, &nn->xdp_prog, &rx, NULL);
 }
 
-static struct rtnl_link_stats64 *nfp_net_stat64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void nfp_net_stat64(struct net_device *netdev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	int r;
@@ -2669,8 +2669,6 @@ static struct rtnl_link_stats64 *nfp_net_stat64(struct net_device *netdev,
 		stats->tx_bytes += data[1];
 		stats->tx_errors += data[2];
 	}
-
-	return stats;
 }
 
 static bool nfp_net_ebpf_capable(struct nfp_net *nn)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 3913f07279d2..dfc2c8149d22 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1733,7 +1733,7 @@ static void nv_update_stats(struct net_device *dev)
  * Called with read_lock(&dev_base_lock) held for read -
  * only synchronized against unregister_netdevice.
  */
-static struct rtnl_link_stats64*
+static void
 nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 	__acquires(&netdev_priv(dev)->hwstats_lock)
 	__releases(&netdev_priv(dev)->hwstats_lock)
@@ -1793,8 +1793,6 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 
 		spin_unlock_bh(&np->hwstats_lock);
 	}
-
-	return storage;
 }
 
 /*
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 561fb94c7267..86fb9d3df700 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -90,8 +90,8 @@ static irqreturn_t netxen_msix_intr(int irq, void *data);
 
 static void netxen_free_ip_list(struct netxen_adapter *, bool);
 static void netxen_restore_indev_addr(struct net_device *dev, unsigned long);
-static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats);
+static void netxen_nic_get_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats);
 static int netxen_nic_set_mac(struct net_device *netdev, void *p);
 
 /*  PCI Device ID Table  */
@@ -2302,8 +2302,8 @@ request_reset:
 	clear_bit(__NX_RESETTING, &adapter->state);
 }
 
-static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *netdev,
-						      struct rtnl_link_stats64 *stats)
+static void netxen_nic_get_stats(struct net_device *netdev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct netxen_adapter *adapter = netdev_priv(netdev);
 
@@ -2313,8 +2313,6 @@ static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *netdev,
 	stats->tx_bytes = adapter->stats.txbytes;
 	stats->rx_dropped = adapter->stats.rxdropped;
 	stats->tx_dropped = adapter->stats.txdropped;
-
-	return stats;
 }
 
 static irqreturn_t netxen_intr(int irq, void *data)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b58509feecd5..40a76a1d5973 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -398,9 +398,8 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	edev->stats.tx_mac_ctrl_frames = stats.tx_mac_ctrl_frames;
 }
 
-static
-struct rtnl_link_stats64 *qede_get_stats64(struct net_device *dev,
-					   struct rtnl_link_stats64 *stats)
+static void qede_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 
@@ -430,8 +429,6 @@ struct rtnl_link_stats64 *qede_get_stats64(struct net_device *dev,
 	stats->collisions = edev->stats.tx_total_collisions;
 	stats->rx_crc_errors = edev->stats.rx_crc_errors;
 	stats->rx_frame_errors = edev->stats.rx_align_errors;
-
-	return stats;
 }
 
 #ifdef CONFIG_QED_SRIOV
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 422289c232bc..40ebe010b06f 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -312,8 +312,8 @@ static int emac_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 }
 
 /* Provide network statistics info for the interface */
-static struct rtnl_link_stats64 *emac_get_stats64(struct net_device *netdev,
-						  struct rtnl_link_stats64 *net_stats)
+static void emac_get_stats64(struct net_device *netdev,
+			     struct rtnl_link_stats64 *net_stats)
 {
 	struct emac_adapter *adpt = netdev_priv(netdev);
 	unsigned int addr = REG_MAC_RX_STATUS_BIN;
@@ -377,8 +377,6 @@ static struct rtnl_link_stats64 *emac_get_stats64(struct net_device *netdev,
 	net_stats->tx_window_errors = stats->tx_late_col;
 
 	spin_unlock(&stats->lock);
-
-	return net_stats;
 }
 
 static const struct net_device_ops emac_netdev_ops = {
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index 9bc047ac883b..5ad59c6d29a4 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -653,9 +653,8 @@ static int rtl8139_poll(struct napi_struct *napi, int budget);
 static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance);
 static int rtl8139_close (struct net_device *dev);
 static int netdev_ioctl (struct net_device *dev, struct ifreq *rq, int cmd);
-static struct rtnl_link_stats64 *rtl8139_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64
-						    *stats);
+static void rtl8139_get_stats64(struct net_device *dev,
+				struct rtnl_link_stats64 *stats);
 static void rtl8139_set_rx_mode (struct net_device *dev);
 static void __set_rx_mode (struct net_device *dev);
 static void rtl8139_hw_start (struct net_device *dev);
@@ -2516,7 +2515,7 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 }
 
 
-static struct rtnl_link_stats64 *
+static void
 rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
@@ -2544,8 +2543,6 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets = tp->tx_stats.packets;
 		stats->tx_bytes = tp->tx_stats.bytes;
 	} while (u64_stats_fetch_retry_irq(&tp->tx_stats.syncp, start));
-
-	return stats;
 }
 
 /* Set or clear the multicast filter for this adaptor.
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 44389c90056a..858f4554de11 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7755,7 +7755,7 @@ err_pm_runtime_put:
 	goto out;
 }
 
-static struct rtnl_link_stats64 *
+static void
 rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
@@ -7809,8 +7809,6 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		le16_to_cpu(tp->tc_offset.tx_aborted);
 
 	pm_runtime_put_noidle(&pdev->dev);
-
-	return stats;
 }
 
 static void rtl8169_net_suspend(struct net_device *dev)
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index cddcff5a00a7..07074d9bc45d 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1706,11 +1706,9 @@ static inline u64 sxgbe_get_stat64(void __iomem *ioaddr, int reg_lo, int reg_hi)
  *  This function is a driver entry point whenever ifconfig command gets
  *  executed to see device statistics. Statistics are number of
  *  bytes sent or received, errors occurred etc.
- *  Return value:
- *  This function returns various statistical information of device.
  */
-static struct rtnl_link_stats64 *sxgbe_get_stats64(struct net_device *dev,
-						   struct rtnl_link_stats64 *stats)
+static void sxgbe_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 	void __iomem *ioaddr = priv->ioaddr;
@@ -1761,8 +1759,6 @@ static struct rtnl_link_stats64 *sxgbe_get_stats64(struct net_device *dev,
 						 SXGBE_MMC_TXUFLWHI_GBCNT_REG);
 	writel(0, ioaddr + SXGBE_MMC_CTL_REG);
 	spin_unlock(&priv->stats_lock);
-
-	return stats;
 }
 
 /*  sxgbe_set_features - entry point to set offload features of the device.
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index bbbed2e84de8..ebeecb8fed45 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2219,16 +2219,14 @@ int efx_net_stop(struct net_device *net_dev)
 }
 
 /* Context: process, dev_base_lock or RTNL held, non-blocking. */
-static struct rtnl_link_stats64 *efx_net_stats(struct net_device *net_dev,
-					       struct rtnl_link_stats64 *stats)
+static void efx_net_stats(struct net_device *net_dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
 	spin_lock_bh(&efx->stats_lock);
 	efx->type->update_stats(efx, NULL, stats);
 	spin_unlock_bh(&efx->stats_lock);
-
-	return stats;
 }
 
 /* Context: netif_tx_lock held, BHs disabled. */
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index ec3ac0e45cc9..8cfbe01e1ddf 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -2158,16 +2158,14 @@ int ef4_net_stop(struct net_device *net_dev)
 }
 
 /* Context: process, dev_base_lock or RTNL held, non-blocking. */
-static struct rtnl_link_stats64 *ef4_net_stats(struct net_device *net_dev,
-					       struct rtnl_link_stats64 *stats)
+static void ef4_net_stats(struct net_device *net_dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 
 	spin_lock_bh(&efx->stats_lock);
 	efx->type->update_stats(efx, NULL, stats);
 	spin_unlock_bh(&efx->stats_lock);
-
-	return stats;
 }
 
 /* Context: netif_tx_lock held, BHs disabled. */
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index f90d1af6d390..e557a3290a25 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -6294,8 +6294,8 @@ no_rings:
 	stats->tx_errors = errors;
 }
 
-static struct rtnl_link_stats64 *niu_get_stats(struct net_device *dev,
-					       struct rtnl_link_stats64 *stats)
+static void niu_get_stats(struct net_device *dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct niu *np = netdev_priv(dev);
 
@@ -6303,8 +6303,6 @@ static struct rtnl_link_stats64 *niu_get_stats(struct net_device *dev,
 		niu_get_rx_stats(np, stats);
 		niu_get_tx_stats(np, stats);
 	}
-
-	return stats;
 }
 
 static void niu_load_hash_xmac(struct niu *np, u16 *hash)
diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c
index 09f5a67da35e..467dcc53f5e1 100644
--- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c
+++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c
@@ -2490,7 +2490,7 @@ static void dwceqos_read_mmc_counters(struct net_local *lp, u32 rx_mask,
 			dwceqos_read(lp, DWC_MMC_RXPACKETCOUNT_GB);
 }
 
-static struct rtnl_link_stats64*
+static void
 dwceqos_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 *s)
 {
 	unsigned long flags;
@@ -2522,8 +2522,6 @@ dwceqos_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 *s)
 	else
 		s->tx_errors = hwstats->txunderflowerror +
 			hwstats->txcarriererror;
-
-	return s;
 }
 
 static void
diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c
index 0a3b7dafa3ba..30cfea62a356 100644
--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -2047,8 +2047,8 @@ static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
  *
  * Returns the address of the device statistics structure.
  */
-static struct rtnl_link_stats64 *tile_net_get_stats64(struct net_device *dev,
-		struct rtnl_link_stats64 *stats)
+static void tile_net_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct tile_net_priv *priv = netdev_priv(dev);
 	u64 rx_packets = 0, tx_packets = 0;
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index 0a6c4e804eed..453a1fad560c 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -513,8 +513,8 @@ static irqreturn_t rhine_interrupt(int irq, void *dev_instance);
 static void rhine_tx(struct net_device *dev);
 static int rhine_rx(struct net_device *dev, int limit);
 static void rhine_set_rx_mode(struct net_device *dev);
-static struct rtnl_link_stats64 *rhine_get_stats64(struct net_device *dev,
-	       struct rtnl_link_stats64 *stats);
+static void rhine_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats);
 static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 static const struct ethtool_ops netdev_ethtool_ops;
 static int  rhine_close(struct net_device *dev);
@@ -2221,7 +2221,7 @@ out_unlock:
 	mutex_unlock(&rp->task_lock);
 }
 
-static struct rtnl_link_stats64 *
+static void
 rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rhine_private *rp = netdev_priv(dev);
@@ -2244,8 +2244,6 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets = rp->tx_stats.packets;
 		stats->tx_bytes = rp->tx_stats.bytes;
 	} while (u64_stats_fetch_retry_irq(&rp->tx_stats.syncp, start));
-
-	return stats;
 }
 
 static void rhine_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index b77e4ecf3cf2..5028001429c7 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -57,8 +57,7 @@ static void fjes_raise_intr_rxdata_task(struct work_struct *);
 static void fjes_tx_stall_task(struct work_struct *);
 static void fjes_force_close_task(struct work_struct *);
 static irqreturn_t fjes_intr(int, void*);
-static struct rtnl_link_stats64 *
-fjes_get_stats64(struct net_device *, struct rtnl_link_stats64 *);
+static void fjes_get_stats64(struct net_device *, struct rtnl_link_stats64 *);
 static int fjes_change_mtu(struct net_device *, int);
 static int fjes_vlan_rx_add_vid(struct net_device *, __be16 proto, u16);
 static int fjes_vlan_rx_kill_vid(struct net_device *, __be16 proto, u16);
@@ -782,14 +781,12 @@ static void fjes_tx_retry(struct net_device *netdev)
 	netif_tx_wake_queue(queue);
 }
 
-static struct rtnl_link_stats64 *
+static void
 fjes_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 {
 	struct fjes_adapter *adapter = netdev_priv(netdev);
 
 	memcpy(stats, &adapter->stats64, sizeof(struct rtnl_link_stats64));
-
-	return stats;
 }
 
 static int fjes_change_mtu(struct net_device *netdev, int new_mtu)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c9414c054852..05374fce7da4 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -908,8 +908,8 @@ out:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
-						    struct rtnl_link_stats64 *t)
+static void netvsc_get_stats64(struct net_device *net,
+			       struct rtnl_link_stats64 *t)
 {
 	struct net_device_context *ndev_ctx = netdev_priv(net);
 	int cpu;
@@ -947,8 +947,6 @@ static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
 
 	t->rx_dropped	= net->stats.rx_dropped;
 	t->rx_errors	= net->stats.rx_errors;
-
-	return t;
 }
 
 static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 66c0eeafcb5d..082534e187fc 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -129,8 +129,8 @@ resched:
 
 }
 
-static struct rtnl_link_stats64 *ifb_stats64(struct net_device *dev,
-					     struct rtnl_link_stats64 *stats)
+static void ifb_stats64(struct net_device *dev,
+			struct rtnl_link_stats64 *stats)
 {
 	struct ifb_dev_private *dp = netdev_priv(dev);
 	struct ifb_q_private *txp = dp->tx_private;
@@ -157,8 +157,6 @@ static struct rtnl_link_stats64 *ifb_stats64(struct net_device *dev,
 	}
 	stats->rx_dropped = dev->stats.rx_dropped;
 	stats->tx_dropped = dev->stats.tx_dropped;
-
-	return stats;
 }
 
 static int ifb_dev_init(struct net_device *dev)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..1cdb8c5ec403 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -303,8 +303,8 @@ static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
 	dev_mc_sync(ipvlan->phy_dev, dev);
 }
 
-static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *s)
+static void ipvlan_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *s)
 {
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 
@@ -341,7 +341,6 @@ static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
 		s->rx_dropped = rx_errs;
 		s->tx_dropped = tx_drps;
 	}
-	return s;
 }
 
 static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 1e05b7c2d157..30a493936e63 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -97,8 +97,8 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats)
+static void loopback_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	u64 bytes = 0;
 	u64 packets = 0;
@@ -122,7 +122,6 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 	stats->tx_packets = packets;
 	stats->rx_bytes   = bytes;
 	stats->tx_bytes   = bytes;
-	return stats;
 }
 
 static u32 always_on(struct net_device *dev)
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index f83cf6696820..778a77303c49 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2888,13 +2888,13 @@ static int macsec_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *macsec_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *s)
+static void macsec_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *s)
 {
 	int cpu;
 
 	if (!dev->tstats)
-		return s;
+		return;
 
 	for_each_possible_cpu(cpu) {
 		struct pcpu_sw_netstats *stats;
@@ -2918,8 +2918,6 @@ static struct rtnl_link_stats64 *macsec_get_stats64(struct net_device *dev,
 
 	s->rx_dropped = dev->stats.rx_dropped;
 	s->tx_dropped = dev->stats.tx_dropped;
-
-	return s;
 }
 
 static int macsec_get_iflink(const struct net_device *dev)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 20b3fdf282c5..440ab3d8adf7 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -855,8 +855,8 @@ static void macvlan_uninit(struct net_device *dev)
 		macvlan_port_destroy(port->dev);
 }
 
-static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
-							 struct rtnl_link_stats64 *stats)
+static void macvlan_dev_get_stats64(struct net_device *dev,
+				    struct rtnl_link_stats64 *stats)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
@@ -893,7 +893,6 @@ static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
 		stats->rx_dropped	= rx_errors;
 		stats->tx_dropped	= tx_dropped;
 	}
-	return stats;
 }
 
 static int macvlan_vlan_rx_add_vid(struct net_device *dev,
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index 2de7faee9b19..b91603835d26 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -58,7 +58,7 @@ static int nlmon_close(struct net_device *dev)
 	return netlink_remove_tap(&nlmon->nt);
 }
 
-static struct rtnl_link_stats64 *
+static void
 nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -86,8 +86,6 @@ nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 
 	stats->rx_bytes = bytes;
 	stats->tx_bytes = 0;
-
-	return stats;
 }
 
 static u32 always_on(struct net_device *dev)
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 3d3b1f4339ef..a411b43a69eb 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1297,7 +1297,7 @@ ppp_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return err;
 }
 
-static struct rtnl_link_stats64*
+static void
 ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 {
 	struct ppp *ppp = netdev_priv(dev);
@@ -1317,8 +1317,6 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 	stats64->rx_dropped       = dev->stats.rx_dropped;
 	stats64->tx_dropped       = dev->stats.tx_dropped;
 	stats64->rx_length_errors = dev->stats.rx_length_errors;
-
-	return stats64;
 }
 
 static int ppp_dev_init(struct net_device *dev)
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 9841f3dc0682..08db4d687533 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -566,7 +566,7 @@ static int sl_change_mtu(struct net_device *dev, int new_mtu)
 
 /* Netdevice get statistics request */
 
-static struct rtnl_link_stats64 *
+static void
 sl_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct net_device_stats *devstats = &dev->stats;
@@ -597,7 +597,6 @@ sl_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->collisions     += comp->sls_o_misses;
 	}
 #endif
-	return stats;
 }
 
 /* Netdevice register callback */
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index bdc58567d10e..a3711769544b 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1798,7 +1798,7 @@ unwind:
 	return err;
 }
 
-static struct rtnl_link_stats64 *
+static void
 team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct team *team = netdev_priv(dev);
@@ -1835,7 +1835,6 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_dropped	= rx_dropped;
 	stats->tx_dropped	= tx_dropped;
 	stats->rx_nohandler	= rx_nohandler;
-	return stats;
 }
 
 static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cd8e02c94be0..8c1d3bd6b4d0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -953,7 +953,7 @@ static void tun_set_headroom(struct net_device *dev, int new_hr)
 	tun->align = new_hr;
 }
 
-static struct rtnl_link_stats64 *
+static void
 tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
@@ -987,7 +987,6 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_dropped  = rx_dropped;
 	stats->rx_frame_errors = rx_frame_errors;
 	stats->tx_dropped = tx_dropped;
-	return stats;
 }
 
 static const struct net_device_ops tun_netdev_ops = {
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0520952aa096..8c39d6d690e5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -158,8 +158,8 @@ static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 	return atomic64_read(&priv->dropped);
 }
 
-static struct rtnl_link_stats64 *veth_get_stats64(struct net_device *dev,
-						  struct rtnl_link_stats64 *tot)
+static void veth_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *tot)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer;
@@ -177,8 +177,6 @@ static struct rtnl_link_stats64 *veth_get_stats64(struct net_device *dev,
 		tot->rx_packets = one.packets;
 	}
 	rcu_read_unlock();
-
-	return tot;
 }
 
 /* fake multicast ability */
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2cea022e6e6e..37db91d1a0a3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1272,8 +1272,8 @@ out:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
-					       struct rtnl_link_stats64 *tot)
+static void virtnet_stats(struct net_device *dev,
+			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int cpu;
@@ -1306,8 +1306,6 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
 	tot->rx_dropped = dev->stats.rx_dropped;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
-
-	return tot;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index aabc6ef366b4..f88ffafebfbf 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -113,7 +113,7 @@ vmxnet3_global_stats[] = {
 };
 
 
-struct rtnl_link_stats64 *
+void
 vmxnet3_get_stats64(struct net_device *netdev,
 		   struct rtnl_link_stats64 *stats)
 {
@@ -160,8 +160,6 @@ vmxnet3_get_stats64(struct net_device *netdev,
 		stats->rx_dropped += drvRxStats->drop_total;
 		stats->multicast +=  devRxStats->mcastPktsRxOK;
 	}
-
-	return stats;
 }
 
 static int
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 59e077be8829..ba1c9f93592b 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -465,8 +465,8 @@ vmxnet3_create_queues(struct vmxnet3_adapter *adapter,
 
 void vmxnet3_set_ethtool_ops(struct net_device *netdev);
 
-struct rtnl_link_stats64 *
-vmxnet3_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats);
+void vmxnet3_get_stats64(struct net_device *dev,
+			 struct rtnl_link_stats64 *stats);
 
 extern char vmxnet3_driver_name[];
 #endif
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 23dfb0eac098..895e3e258543 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -77,8 +77,8 @@ static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
-static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
-						 struct rtnl_link_stats64 *stats)
+static void vrf_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats)
 {
 	int i;
 
@@ -102,7 +102,6 @@ static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 		stats->rx_bytes += rbytes;
 		stats->rx_packets += rpkts;
 	}
-	return stats;
 }
 
 /* Local traffic destined to local address. Reinsert the packet to rx
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index a479cd99911d..40f26b69beb1 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1073,8 +1073,8 @@ static int xennet_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *tot)
+static void xennet_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *tot)
 {
 	struct netfront_info *np = netdev_priv(dev);
 	int cpu;
@@ -1105,8 +1105,6 @@ static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
 
 	tot->rx_errors  = dev->stats.rx_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
-
-	return tot;
 }
 
 static void xennet_release_tx_bufs(struct netfront_queue *queue)
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index fb0928a4fb97..f84069ffa8c6 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -397,14 +397,6 @@ static void xlr_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats)
 			TX_DROP_FRAME_COUNTER);
 }
 
-static struct rtnl_link_stats64 *xlr_get_stats64(struct net_device *ndev,
-						 struct rtnl_link_stats64 *stats
-						 )
-{
-	xlr_stats(ndev, stats);
-	return stats;
-}
-
 static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_open = xlr_net_open,
 	.ndo_stop = xlr_net_stop,
@@ -412,7 +404,7 @@ static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_select_queue = xlr_net_select_queue,
 	.ndo_set_mac_address = xlr_net_set_mac_addr,
 	.ndo_set_rx_mode = xlr_set_rx_mode,
-	.ndo_get_stats64 = xlr_get_stats64,
+	.ndo_get_stats64 = xlr_stats,
 };
 
 /*
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ecd78b3c9aba..b14ad9c139d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -913,8 +913,8 @@ struct netdev_xdp {
  *	Callback used when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
- * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
- *                      struct rtnl_link_stats64 *storage);
+ * void (*ndo_get_stats64)(struct net_device *dev,
+ *                         struct rtnl_link_stats64 *storage);
  * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  *	Called when a user wants to get the network device usage
  *	statistics. Drivers must do one of the following:
@@ -1165,8 +1165,8 @@ struct net_device_ops {
 						   struct neigh_parms *);
 	void			(*ndo_tx_timeout) (struct net_device *dev);
 
-	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
-						     struct rtnl_link_stats64 *storage);
+	void			(*ndo_get_stats64)(struct net_device *dev,
+						   struct rtnl_link_stats64 *storage);
 	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
 	int			(*ndo_get_offload_stats)(int attr_id,
 							 const struct net_device *dev,
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index e893fe43dd13..3d4ca4df1209 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -261,8 +261,8 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *tot);
+void ip_tunnel_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *tot);
 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 				   int link, __be16 flags,
 				   __be32 remote, __be32 local,
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 10da6c588bf8..116455ac3db5 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct vlan_pcpu_stats *p;
 	u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 	}
 	stats->rx_errors  = rx_errors;
 	stats->tx_dropped = tx_dropped;
-
-	return stats;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ed3b3192fb00..6c46d1b4cdbb 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -153,8 +153,8 @@ static int br_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void br_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,8 +178,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
 	stats->tx_packets = sum.tx_packets;
 	stats->rx_bytes   = sum.rx_bytes;
 	stats->rx_packets = sum.rx_packets;
-
-	return stats;
 }
 
 static int br_change_mtu(struct net_device *dev, int new_mtu)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..5476110598f7 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 
 /* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *tot)
 {
 	int i;
 
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		tot->rx_bytes   += rx_bytes;
 		tot->tx_bytes   += tx_bytes;
 	}
-
-	return tot;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
 
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e2c6ae024565..8bf18a5f66e0 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats)
+static void l2tp_eth_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct l2tp_eth *priv = netdev_priv(dev);
 
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
 	stats->rx_bytes   = atomic_long_read(&priv->rx_bytes);
 	stats->rx_packets = atomic_long_read(&priv->rx_packets);
 	stats->rx_errors  = atomic_long_read(&priv->rx_errors);
-	return stats;
 }
 
-
 static const struct net_device_ops l2tp_eth_netdev_ops = {
 	.ndo_init		= l2tp_eth_dev_init,
 	.ndo_uninit		= l2tp_eth_dev_uninit,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 41497b670e2b..77e8a42225f9 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1122,7 +1122,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
 }
 
-static struct rtnl_link_stats64 *
+static void
 ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -1147,8 +1147,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->rx_bytes   += rx_bytes;
 		stats->tx_bytes   += tx_bytes;
 	}
-
-	return stats;
 }
 
 static const struct net_device_ops ieee80211_dataif_ops = {
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d5d6caecd072..09141a18ee2d 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
 	free_netdev(dev);
 }
 
-static struct rtnl_link_stats64 *
+static void
 internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_bytes         += local_stats.tx_bytes;
 		stats->tx_packets       += local_stats.tx_packets;
 	}
-
-	return stats;
 }
 
 static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index b0196366d58d..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
-						     struct rtnl_link_stats64 *stats)
+static void teql_master_stats64(struct net_device *dev,
+				struct rtnl_link_stats64 *stats)
 {
 	struct teql_master *m = netdev_priv(dev);
 
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
 	stats->tx_bytes		= m->tx_bytes;
 	stats->tx_errors	= m->tx_errors;
 	stats->tx_dropped	= m->tx_dropped;
-	return stats;
 }
 
 static int teql_master_mtu(struct net_device *dev, int new_mtu)
-- 
cgit v1.2.3


From d6264071ce7d100a2b7c1f295167796ab5178caf Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 7 Jan 2017 17:06:34 -0500
Subject: net-tc: make MAX_RECLASSIFY_LOOP local

This field is no longer kept in tc_verd. Remove it from the global
definition of that struct.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 5 -----
 net/sched/sch_api.c          | 3 ++-
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index c769f71972f5..bba23dbb3ab6 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -17,9 +17,6 @@
 
 /* verdict bit breakdown 
  *
-bit 2,3,4,5: Reclassify counter - sort of reverse TTL - if exceeded
-assume loop
-
 bit 6,7: Where this packet was last seen 
 0: Above the transmit example at the socket level
 1: on the Ingress
@@ -48,8 +45,6 @@ bit 8: when set --> Request not to classify on ingress.
 #define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
 #define V_TC_AT(x)       _TC_MAKEVALUE(x,S_TC_AT)
 #define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))
-
-#define MAX_REC_LOOP 4
 #endif
 
 /* Action attributes */
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d7b93429f0cc..ef53ede11590 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1861,6 +1861,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 {
 	__be16 protocol = tc_skb_protocol(skb);
 #ifdef CONFIG_NET_CLS_ACT
+	const int max_reclassify_loop = 4;
 	const struct tcf_proto *old_tp = tp;
 	int limit = 0;
 
@@ -1885,7 +1886,7 @@ reclassify:
 	return TC_ACT_UNSPEC; /* signal: continue lookup */
 #ifdef CONFIG_NET_CLS_ACT
 reset:
-	if (unlikely(limit++ >= MAX_REC_LOOP)) {
+	if (unlikely(limit++ >= max_reclassify_loop)) {
 		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
 				       tp->q->ops->id, tp->prio & 0xffff,
 				       ntohs(tp->protocol));
-- 
cgit v1.2.3


From e7246e122aaa99ebbb8ad7da80f35a20577bd8af Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 7 Jan 2017 17:06:35 -0500
Subject: net-tc: extract skip classify bit from tc_verd

Packets sent by the IFB device skip subsequent tc classification.
A single bit governs this state. Move it out of tc_verd in
anticipation of removing that __u16 completely.

The new bitfield tc_skip_classify temporarily uses one bit of a
hole, until tc_verd is removed completely in a follow-up patch.

Remove the bit hole comment. It could be 2, 3, 4 or 5 bits long.
With that many options, little value in documenting it.

Introduce a helper function to deduplicate the logic in the two
sites that check this bit.

The field tc_skip_classify is set only in IFB on skbs cloned in
act_mirred, so original packet sources do not have to clear the
bit when reusing packets (notably, pktgen and octeon).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c            |  2 +-
 include/linux/skbuff.h       |  5 ++++-
 include/net/sch_generic.h    | 11 +++++++++++
 include/uapi/linux/pkt_cls.h |  6 ------
 net/core/dev.c               | 10 +++-------
 net/sched/act_api.c          | 11 ++++-------
 6 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 082534e187fc..442c4c4a9606 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -81,7 +81,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
 		txp->tx_packets++;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..570f60ec6cb4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -589,6 +589,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@pkt_type: Packet class
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
+ *	@tc_skip_classify: do not classify packet. set by IFB device
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -749,7 +750,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 #endif
-	/* 2, 4 or 5 bit hole */
+#ifdef CONFIG_NET_CLS_ACT
+	__u8			tc_skip_classify:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 498f81b229a4..857356f2d74b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -418,6 +418,17 @@ static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_skip_tc_classify(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (skb->tc_skip_classify) {
+		skb->tc_skip_classify = 0;
+		return true;
+	}
+#endif
+	return false;
+}
+
 /* Reset all TX qdiscs greater then index of a device.  */
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
 {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index bba23dbb3ab6..1eed5d7509bc 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -22,8 +22,6 @@ bit 6,7: Where this packet was last seen
 1: on the Ingress
 2: on the Egress
 
-bit 8: when set --> Request not to classify on ingress. 
-
  *
  * */
 
@@ -36,10 +34,6 @@ bit 8: when set --> Request not to classify on ingress.
 #define AT_INGRESS	0x1
 #define AT_EGRESS	0x2
 
-#define TC_NCLS          _TC_MAKEMASK1(8)
-#define SET_TC_NCLS(v)   ( TC_NCLS | (v & ~TC_NCLS))
-#define CLR_TC_NCLS(v)   ( v & ~TC_NCLS)
-
 #define S_TC_AT          _TC_MAKE32(12)
 #define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
 #define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
diff --git a/net/core/dev.c b/net/core/dev.c
index 56818f7eab2b..e39e35d2e082 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4093,12 +4093,8 @@ another_round:
 			goto out;
 	}
 
-#ifdef CONFIG_NET_CLS_ACT
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		goto ncls;
-	}
-#endif
+	if (skb_skip_tc_classify(skb))
+		goto skip_classify;
 
 	if (pfmemalloc)
 		goto skip_taps;
@@ -4128,8 +4124,8 @@ skip_taps:
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = 0;
-ncls:
 #endif
+skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 2095c83ce773..f04715a57300 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -426,11 +426,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 {
 	int ret = -1, i;
 
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		ret = TC_ACT_OK;
-		goto exec_done;
-	}
+	if (skb_skip_tc_classify(skb))
+		return TC_ACT_OK;
+
 	for (i = 0; i < nr_actions; i++) {
 		const struct tc_action *a = actions[i];
 
@@ -439,9 +437,8 @@ repeat:
 		if (ret == TC_ACT_REPEAT)
 			goto repeat;	/* we need a ttl - JHS */
 		if (ret != TC_ACT_PIPE)
-			goto exec_done;
+			break;
 	}
-exec_done:
 	return ret;
 }
 EXPORT_SYMBOL(tcf_action_exec);
-- 
cgit v1.2.3


From a5135bcfba7345031df45e02cd150a45add47cf8 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 7 Jan 2017 17:06:36 -0500
Subject: net-tc: convert tc_verd to integer bitfields

Extract the remaining two fields from tc_verd and remove the __u16
completely. TC_AT and TC_FROM are converted to equivalent two-bit
integer fields tc_at and tc_from. Where possible, use existing
helper skb_at_tc_ingress when reading tc_at. Introduce helper
skb_reset_tc to clear fields.

Not documenting tc_from and tc_at, because they will be replaced
with single bit fields in follow-on patches.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c                    |  7 +++----
 drivers/staging/octeon/ethernet-tx.c |  5 ++---
 include/linux/skbuff.h               |  6 ++----
 include/net/sch_generic.h            | 10 +++++++++-
 include/uapi/linux/pkt_cls.h         | 31 -------------------------------
 net/core/dev.c                       | 10 ++++------
 net/core/pktgen.c                    |  4 +---
 net/core/skbuff.c                    |  3 ---
 net/sched/act_ife.c                  |  7 +++----
 net/sched/act_mirred.c               |  9 ++++-----
 net/sched/sch_netem.c                |  2 +-
 11 files changed, 29 insertions(+), 65 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 442c4c4a9606..b73b6b6c066b 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -78,9 +78,9 @@ static void ifb_ri_tasklet(unsigned long _txp)
 	}
 
 	while ((skb = __skb_dequeue(&txp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
+		u32 from = skb->tc_from;
 
-		skb->tc_verd = 0;
+		skb_reset_tc(skb);
 		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
@@ -239,7 +239,6 @@ static void ifb_setup(struct net_device *dev)
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ifb_dev_private *dp = netdev_priv(dev);
-	u32 from = G_TC_FROM(skb->tc_verd);
 	struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb);
 
 	u64_stats_update_begin(&txp->rsync);
@@ -247,7 +246,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	txp->rx_bytes += skb->len;
 	u64_stats_update_end(&txp->rsync);
 
-	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
+	if (skb->tc_from == AT_STACK || !skb->skb_iif) {
 		dev_kfree_skb(skb);
 		dev->stats.rx_dropped++;
 		return NETDEV_TX_OK;
diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c
index 6b4c20872323..0b8053205091 100644
--- a/drivers/staging/octeon/ethernet-tx.c
+++ b/drivers/staging/octeon/ethernet-tx.c
@@ -23,6 +23,7 @@
 #endif /* CONFIG_XFRM */
 
 #include <linux/atomic.h>
+#include <net/sch_generic.h>
 
 #include <asm/octeon/octeon.h>
 
@@ -369,9 +370,7 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 
 #ifdef CONFIG_NET_SCHED
 	skb->tc_index = 0;
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = 0;
-#endif /* CONFIG_NET_CLS_ACT */
+	skb_reset_tc(skb);
 #endif /* CONFIG_NET_SCHED */
 #endif /* REUSE_SKBUFFS_WITHOUT_FREE */
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 570f60ec6cb4..f738d09947b2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -599,7 +599,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
- *	@tc_verd: traffic control verdict
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
@@ -752,13 +751,12 @@ struct sk_buff {
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
+	__u8			tc_at:2;
+	__u8			tc_from:2;
 #endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
-#ifdef CONFIG_NET_CLS_ACT
-	__u16			tc_verd;	/* traffic control verdict */
-#endif
 #endif
 
 	union {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 857356f2d74b..f80dba516964 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -409,10 +409,18 @@ bool tcf_destroy(struct tcf_proto *tp, bool force);
 void tcf_destroy_chain(struct tcf_proto __rcu **fl);
 int skb_do_redirect(struct sk_buff *);
 
+static inline void skb_reset_tc(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	skb->tc_at = 0;
+	skb->tc_from = 0;
+#endif
+}
+
 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return G_TC_AT(skb->tc_verd) & AT_INGRESS;
+	return skb->tc_at & AT_INGRESS;
 #else
 	return false;
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 1eed5d7509bc..cee753a7a40c 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -5,40 +5,9 @@
 #include <linux/pkt_sched.h>
 
 #ifdef __KERNEL__
-/* I think i could have done better macros ; for now this is stolen from
- * some arch/mips code - jhs
-*/
-#define _TC_MAKE32(x) ((x))
-
-#define _TC_MAKEMASK1(n) (_TC_MAKE32(1) << _TC_MAKE32(n))
-#define _TC_MAKEMASK(v,n) (_TC_MAKE32((_TC_MAKE32(1)<<(v))-1) << _TC_MAKE32(n))
-#define _TC_MAKEVALUE(v,n) (_TC_MAKE32(v) << _TC_MAKE32(n))
-#define _TC_GETVALUE(v,n,m) ((_TC_MAKE32(v) & _TC_MAKE32(m)) >> _TC_MAKE32(n))
-
-/* verdict bit breakdown 
- *
-bit 6,7: Where this packet was last seen 
-0: Above the transmit example at the socket level
-1: on the Ingress
-2: on the Egress
-
- *
- * */
-
-#define S_TC_FROM          _TC_MAKE32(6)
-#define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
-#define G_TC_FROM(x)       _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM)
-#define V_TC_FROM(x)       _TC_MAKEVALUE(x,S_TC_FROM)
-#define SET_TC_FROM(v,n)   ((V_TC_FROM(n)) | (v & ~M_TC_FROM))
 #define AT_STACK	0x0
 #define AT_INGRESS	0x1
 #define AT_EGRESS	0x2
-
-#define S_TC_AT          _TC_MAKE32(12)
-#define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
-#define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
-#define V_TC_AT(x)       _TC_MAKEVALUE(x,S_TC_AT)
-#define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))
 #endif
 
 /* Action attributes */
diff --git a/net/core/dev.c b/net/core/dev.c
index e39e35d2e082..8b5d6d033473 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3153,7 +3153,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!cl)
 		return skb;
 
-	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
+	/* skb->tc_at and qdisc_skb_cb(skb)->pkt_len were already set
 	 * earlier by the caller.
 	 */
 	qdisc_bstats_cpu_update(cl->q, skb);
@@ -3320,7 +3320,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+	skb->tc_at = AT_EGRESS;
 # ifdef CONFIG_NET_EGRESS
 	if (static_key_false(&egress_needed)) {
 		skb = sch_handle_egress(skb, &rc, dev);
@@ -3920,7 +3920,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+	skb->tc_at = AT_INGRESS;
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -4122,9 +4122,7 @@ skip_taps:
 			goto out;
 	}
 #endif
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = 0;
-#endif
+	skb_reset_tc(skb);
 skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8e69ce472236..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 			/* skb was 'freed' by stack, so clean few
 			 * bits and reuse it
 			 */
-#ifdef CONFIG_NET_CLS_ACT
-			skb->tc_verd = 0; /* reset reclass/redir ttl */
-#endif
+			skb_reset_tc(skb);
 		} while (--burst > 0);
 		goto out; /* Skips xmit_mode M_START_XMIT */
 	} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..adec4bf807d8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -878,9 +878,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 #ifdef CONFIG_NET_SCHED
 	CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
-	CHECK_SKB_FIELD(tc_verd);
-#endif
 #endif
 
 }
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 80b848d3f096..921fb20eaa7c 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -736,12 +736,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	u16 metalen = ife_get_sz(skb, ife);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
 	unsigned int skboff = skb->dev->hard_header_len;
-	u32 at = G_TC_AT(skb->tc_verd);
 	int new_len = skb->len + hdrm;
 	bool exceed_mtu = false;
 	int err;
 
-	if (at & AT_EGRESS) {
+	if (!skb_at_tc_ingress(skb)) {
 		if (new_len > skb->dev->mtu)
 			exceed_mtu = true;
 	}
@@ -773,7 +772,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		return TC_ACT_SHOT;
 	}
 
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_push(skb, skb->dev->hard_header_len);
 
 	iethh = (struct ethhdr *)skb->data;
@@ -816,7 +815,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		ether_addr_copy(oethh->h_dest, iethh->h_dest);
 	oethh->h_proto = htons(ife->eth_type);
 
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_pull(skb, skb->dev->hard_header_len);
 
 	spin_unlock(&ife->tcf_lock);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2d9fa6e0a1b4..8543279bba49 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -170,7 +170,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	int retval, err = 0;
 	int m_eaction;
 	int mac_len;
-	u32 at;
 
 	tcf_lastuse_update(&m->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -191,7 +190,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 		goto out;
 	}
 
-	at = G_TC_AT(skb->tc_verd);
 	skb2 = skb_clone(skb, GFP_ATOMIC);
 	if (!skb2)
 		goto out;
@@ -200,8 +198,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	 * and devices expect a mac header on xmit, then mac push/pull is
 	 * needed.
 	 */
-	if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) {
-		if (at & AT_EGRESS) {
+	if (skb->tc_at != tcf_mirred_act_direction(m_eaction) &&
+	    m_mac_header_xmit) {
+		if (!skb_at_tc_ingress(skb)) {
 			/* caught at egress, act ingress: pull mac */
 			mac_len = skb_network_header(skb) - skb_mac_header(skb);
 			skb_pull_rcsum(skb2, mac_len);
@@ -213,7 +212,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	/* mirror is always swallowed */
 	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+		skb2->tc_from = skb2->tc_at;
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bcfadfdea8e0..bb5c638b6852 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
 			 * If it's at ingress let's pretend the delay is
 			 * from the network (tstamp will be updated).
 			 */
-			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+			if (skb->tc_from & AT_INGRESS)
 				skb->tstamp = 0;
 #endif
 
-- 
cgit v1.2.3


From 8dc07fdbf2054f157e8333f940a1ad728916c786 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 7 Jan 2017 17:06:37 -0500
Subject: net-tc: convert tc_at to tc_at_ingress

Field tc_at is used only within tc actions to distinguish ingress from
egress processing. A single bit is sufficient for this purpose.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  3 ++-
 include/net/sch_generic.h |  3 +--
 net/core/dev.c            |  8 +++-----
 net/sched/act_mirred.c    | 12 ++++++------
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f738d09947b2..fab3f87e9bd1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -590,6 +590,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
  *	@tc_skip_classify: do not classify packet. set by IFB device
+ *	@tc_at_ingress: used within tc_classify to distinguish in/egress
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -751,7 +752,7 @@ struct sk_buff {
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
-	__u8			tc_at:2;
+	__u8			tc_at_ingress:1;
 	__u8			tc_from:2;
 #endif
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f80dba516964..4bd6d5387209 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -412,7 +412,6 @@ int skb_do_redirect(struct sk_buff *);
 static inline void skb_reset_tc(struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_at = 0;
 	skb->tc_from = 0;
 #endif
 }
@@ -420,7 +419,7 @@ static inline void skb_reset_tc(struct sk_buff *skb)
 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return skb->tc_at & AT_INGRESS;
+	return skb->tc_at_ingress;
 #else
 	return false;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 8b5d6d033473..c143f1391117 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3153,9 +3153,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!cl)
 		return skb;
 
-	/* skb->tc_at and qdisc_skb_cb(skb)->pkt_len were already set
-	 * earlier by the caller.
-	 */
+	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3320,7 +3318,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_at = AT_EGRESS;
+	skb->tc_at_ingress = 0;
 # ifdef CONFIG_NET_EGRESS
 	if (static_key_false(&egress_needed)) {
 		skb = sch_handle_egress(skb, &rc, dev);
@@ -3920,7 +3918,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	skb->tc_at = AT_INGRESS;
+	skb->tc_at_ingress = 1;
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8543279bba49..e832c62fd705 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -39,15 +39,15 @@ static bool tcf_mirred_is_act_redirect(int action)
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
 }
 
-static u32 tcf_mirred_act_direction(int action)
+static bool tcf_mirred_act_wants_ingress(int action)
 {
 	switch (action) {
 	case TCA_EGRESS_REDIR:
 	case TCA_EGRESS_MIRROR:
-		return AT_EGRESS;
+		return false;
 	case TCA_INGRESS_REDIR:
 	case TCA_INGRESS_MIRROR:
-		return AT_INGRESS;
+		return true;
 	default:
 		BUG();
 	}
@@ -198,7 +198,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	 * and devices expect a mac header on xmit, then mac push/pull is
 	 * needed.
 	 */
-	if (skb->tc_at != tcf_mirred_act_direction(m_eaction) &&
+	if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
 	    m_mac_header_xmit) {
 		if (!skb_at_tc_ingress(skb)) {
 			/* caught at egress, act ingress: pull mac */
@@ -212,11 +212,11 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	/* mirror is always swallowed */
 	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_from = skb2->tc_at;
+		skb2->tc_from = skb_at_tc_ingress(skb) ? AT_INGRESS : AT_EGRESS;
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
-	if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS)
+	if (!tcf_mirred_act_wants_ingress(m_eaction))
 		err = dev_queue_xmit(skb2);
 	else
 		err = netif_receive_skb(skb2);
-- 
cgit v1.2.3


From bc31c905e946b5c55df5d2938335e78ffb3157ca Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 7 Jan 2017 17:06:38 -0500
Subject: net-tc: convert tc_from to tc_from_ingress and tc_redirected

The tc_from field fulfills two roles. It encodes whether a packet was
redirected by an act_mirred device and, if so, whether act_mirred was
called on ingress or egress. Split it into separate fields.

The information is needed by the special IFB loop, where packets are
taken out of the normal path by act_mirred, forwarded to IFB, then
reinjected at their original location (ingress or egress) by IFB.

The IFB device cannot use skb->tc_at_ingress, because that may have
been overwritten as the packet travels from act_mirred to ifb_xmit,
when it passes through tc_classify on the IFB egress path. Cache this
value in skb->tc_from_ingress.

That field is valid only if a packet arriving at ifb_xmit came from
act_mirred. Other packets can be crafted to reach ifb_xmit. These
must be dropped. Set tc_redirected on redirection and drop all packets
that do not have this bit set.

Both fields are set only on cloned skbs in tc actions, so original
packet sources do not have to clear the bit when reusing packets
(notably, pktgen and octeon).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c            | 13 +++++--------
 include/linux/skbuff.h       |  5 ++++-
 include/net/sch_generic.h    |  2 +-
 include/uapi/linux/pkt_cls.h |  6 ------
 net/sched/act_mirred.c       |  6 ++++--
 net/sched/sch_netem.c        |  2 +-
 6 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index b73b6b6c066b..312fce7302d3 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -78,9 +78,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 	}
 
 	while ((skb = __skb_dequeue(&txp->tq)) != NULL) {
-		u32 from = skb->tc_from;
-
-		skb_reset_tc(skb);
+		skb->tc_redirected = 0;
 		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
@@ -101,13 +99,12 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		rcu_read_unlock();
 		skb->skb_iif = txp->dev->ifindex;
 
-		if (from & AT_EGRESS) {
+		if (!skb->tc_from_ingress) {
 			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
+		} else {
 			skb_pull(skb, skb->mac_len);
 			netif_receive_skb(skb);
-		} else
-			BUG();
+		}
 	}
 
 	if (__netif_tx_trylock(txq)) {
@@ -246,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	txp->rx_bytes += skb->len;
 	u64_stats_update_end(&txp->rsync);
 
-	if (skb->tc_from == AT_STACK || !skb->skb_iif) {
+	if (!skb->tc_redirected || !skb->skb_iif) {
 		dev_kfree_skb(skb);
 		dev->stats.rx_dropped++;
 		return NETDEV_TX_OK;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fab3f87e9bd1..3149a88de548 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -591,6 +591,8 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@ipvs_property: skbuff is owned by ipvs
  *	@tc_skip_classify: do not classify packet. set by IFB device
  *	@tc_at_ingress: used within tc_classify to distinguish in/egress
+ *	@tc_redirected: packet was redirected by a tc action
+ *	@tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -753,7 +755,8 @@ struct sk_buff {
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;
-	__u8			tc_from:2;
+	__u8			tc_redirected:1;
+	__u8			tc_from_ingress:1;
 #endif
 
 #ifdef CONFIG_NET_SCHED
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4bd6d5387209..e2f426f6d62f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -412,7 +412,7 @@ int skb_do_redirect(struct sk_buff *);
 static inline void skb_reset_tc(struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_from = 0;
+	skb->tc_redirected = 0;
 #endif
 }
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index cee753a7a40c..a081efbd61a2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -4,12 +4,6 @@
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
 
-#ifdef __KERNEL__
-#define AT_STACK	0x0
-#define AT_INGRESS	0x1
-#define AT_EGRESS	0x2
-#endif
-
 /* Action attributes */
 enum {
 	TCA_ACT_UNSPEC,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e832c62fd705..84682f02b611 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -211,8 +211,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	/* mirror is always swallowed */
-	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_from = skb_at_tc_ingress(skb) ? AT_INGRESS : AT_EGRESS;
+	if (tcf_mirred_is_act_redirect(m_eaction)) {
+		skb2->tc_redirected = 1;
+		skb2->tc_from_ingress = skb2->tc_at_ingress;
+	}
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb5c638b6852..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
 			 * If it's at ingress let's pretend the delay is
 			 * from the network (tstamp will be updated).
 			 */
-			if (skb->tc_from & AT_INGRESS)
+			if (skb->tc_redirected && skb->tc_from_ingress)
 				skb->tstamp = 0;
 #endif
 
-- 
cgit v1.2.3


From 9b8e34e211b15af429b72388a8f2b3b1823d172e Mon Sep 17 00:00:00 2001
From: Michał Kępień
Date: Fri, 6 Jan 2017 07:07:47 +0100
Subject: rfkill: Add rfkill-any LED trigger

Add a new "global" (i.e. not per-rfkill device) LED trigger, rfkill-any,
which may be useful on laptops with a single "radio LED" and multiple
radio transmitters.  The trigger is meant to turn a LED on whenever
there is at least one radio transmitter active and turn it off
otherwise.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index afa4f71b4c7b..2064c3a35ef8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 	led_trigger_unregister(&rfkill->led_trigger);
 }
+
+static struct led_trigger rfkill_any_led_trigger;
+static struct work_struct rfkill_any_work;
+
+static void rfkill_any_led_trigger_worker(struct work_struct *work)
+{
+	enum led_brightness brightness = LED_OFF;
+	struct rfkill *rfkill;
+
+	mutex_lock(&rfkill_global_mutex);
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+			brightness = LED_FULL;
+			break;
+		}
+	}
+	mutex_unlock(&rfkill_global_mutex);
+
+	led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+	schedule_work(&rfkill_any_work);
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+	rfkill_any_led_trigger_event();
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
+	rfkill_any_led_trigger.name = "rfkill-any";
+	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+	led_trigger_unregister(&rfkill_any_led_trigger);
+	cancel_work_sync(&rfkill_any_work);
+}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
+
+static void rfkill_any_led_trigger_event(void)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -477,6 +535,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
@@ -520,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	return blocked;
 }
@@ -569,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
+		rfkill_any_led_trigger_event();
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -985,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
+	rfkill_any_led_trigger_event();
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1017,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
+	rfkill_any_led_trigger_event();
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1269,6 +1332,10 @@ static int __init rfkill_init(void)
 	if (error)
 		goto error_misc;
 
+	error = rfkill_any_led_trigger_register();
+	if (error)
+		goto error_led_trigger;
+
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
 	if (error)
@@ -1279,8 +1346,10 @@ static int __init rfkill_init(void)
 
 #ifdef CONFIG_RFKILL_INPUT
 error_input:
-	misc_deregister(&rfkill_miscdev);
+	rfkill_any_led_trigger_unregister();
 #endif
+error_led_trigger:
+	misc_deregister(&rfkill_miscdev);
 error_misc:
 	class_unregister(&rfkill_class);
 error_class:
@@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
+	rfkill_any_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
-- 
cgit v1.2.3


From 210f035316f545e6f507e7d61e191495ba983e27 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 5 Jan 2017 10:38:36 +0000
Subject: rxrpc: Allow listen(sock, 0) to be used to disable listening

Allow listen() with a backlog of 0 to be used to disable listening on an
AF_RXRPC socket.  This also releases any preallocation, thereby making it
easier for a kernel service to account for all allocated call structures
when shutting down the service.

The socket cannot thereafter have listening reenabled, but must rather be
closed and reopened.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    | 8 ++++++++
 net/rxrpc/ar-internal.h | 1 +
 net/rxrpc/call_accept.c | 3 ++-
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5f63f6dcaabb..199b46e93e64 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
 		else
 			sk->sk_max_ack_backlog = old;
 		break;
+	case RXRPC_SERVER_LISTENING:
+		if (backlog == 0) {
+			rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED;
+			sk->sk_max_ack_backlog = 0;
+			rxrpc_discard_prealloc(rx);
+			ret = 0;
+			break;
+		}
 	default:
 		ret = -EBUSY;
 		break;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 84927c7b5fdf..12be432be9b2 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -60,6 +60,7 @@ enum {
 	RXRPC_CLIENT_BOUND,		/* client local address bound */
 	RXRPC_SERVER_BOUND,		/* server local address bound */
 	RXRPC_SERVER_LISTENING,		/* server listening for connections */
+	RXRPC_SERVER_LISTEN_DISABLED,	/* server listening disabled */
 	RXRPC_CLOSE,			/* socket is being closed */
 };
 
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 832d854c2d5c..7c4c64ab8da2 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -349,7 +349,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 
 found_service:
 	spin_lock(&rx->incoming_lock);
-	if (rx->sk.sk_state == RXRPC_CLOSE) {
+	if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
+	    rx->sk.sk_state == RXRPC_CLOSE) {
 		trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
 				  sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
 		skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
-- 
cgit v1.2.3


From 4ef8c1c93f848e360754f10eb2e7134c872b6597 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Mon, 9 Jan 2017 11:10:42 +0100
Subject: cfg80211: size various nl80211 messages correctly

Ilan reported that sometimes nl80211 messages weren't working if
the frames being transported got very large, which was really a
problem for userspace-to-kernel messages, but prompted me to look
at the code.

Upon review, I found various places where variable-length data is
transported in an nl80211 message but the message isn't allocated
taking that into account. This shouldn't cause any problems since
the frames aren't really that long, apart in one place where two
(possibly very long frames) might not fit.

Fix all the places (that I found) that get variable length data
from the driver and put it into a message to take the length of
the variable data into account. The 100 there is just a safe
constant for the remaining message overhead (it's usually around
50 for most messages.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 23692658fe98..fed33ec20a71 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13018,7 +13018,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return;
 
@@ -13170,7 +13170,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13212,7 +13212,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13249,7 +13249,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	msg = nlmsg_new(100 + ie_len, GFP_KERNEL);
 	if (!msg)
 		return;
 
@@ -13325,7 +13325,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
 
 	trace_cfg80211_notify_new_peer_candidate(dev, addr);
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13696,7 +13696,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return -ENOMEM;
 
@@ -13740,7 +13740,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 
 	trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return;
 
@@ -14551,7 +14551,7 @@ void cfg80211_ft_event(struct net_device *netdev,
 	if (!ft_event->target_ap)
 		return;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL);
 	if (!msg)
 		return;
 
-- 
cgit v1.2.3


From bd2522b168847106c1885f0319a2833bdf88bf9a Mon Sep 17 00:00:00 2001
From: Andrzej Zaborowski
Date: Fri, 6 Jan 2017 16:33:43 -0500
Subject: cfg80211: NL80211_ATTR_SOCKET_OWNER support for CMD_CONNECT

Disconnect or deauthenticate when the owning socket is closed if this
flag is supplied to CMD_CONNECT or CMD_ASSOCIATE.  This may be used
to ensure userspace daemon doesn't leave an unmanaged connection behind.

In some situations it would be possible to account for that, to some
degree, in the deamon restart code or in the up/down scripts without
the use of this attribute.  But there will be systems where the daemon
can go away for varying periods without a warning due to local resource
management.

Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  7 +++++++
 include/uapi/linux/nl80211.h |  2 ++
 net/wireless/core.c          |  3 +++
 net/wireless/core.h          |  1 +
 net/wireless/mlme.c          |  5 +++++
 net/wireless/nl80211.c       | 26 +++++++++++++++++++++++++-
 net/wireless/sme.c           | 33 +++++++++++++++++++++++++++++++++
 7 files changed, 76 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 41a9ecd82ca0..cb13789ebaef 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3865,6 +3865,9 @@ struct cfg80211_cached_keys;
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
  * @conn_bss_type: connecting/connected BSS type
+ * @conn_owner_nlportid: (private) connection owner socket port ID
+ * @disconnect_wk: (private) auto-disconnect work
+ * @disconnect_bssid: (private) the BSSID to use for auto-disconnect
  * @ibss_fixed: (private) IBSS is using fixed BSSID
  * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
@@ -3896,6 +3899,10 @@ struct wireless_dev {
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
 	enum ieee80211_bss_type conn_bss_type;
+	u32 conn_owner_nlportid;
+
+	struct work_struct disconnect_wk;
+	u8 disconnect_bssid[ETH_ALEN];
 
 	struct list_head event_list;
 	spinlock_t event_lock;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d74e10b1246a..174f4b30e804 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1820,6 +1820,8 @@ enum nl80211_commands {
  *	and remove functions. NAN notifications will be sent in unicast to that
  *	socket. Without this attribute, any socket can add functions and the
  *	notifications will be sent to the %NL80211_MCGRP_NAN multicast group.
+ *	If set during %NL80211_CMD_ASSOCIATE or %NL80211_CMD_CONNECT the
+ *	station will deauthenticate when the socket is closed.
  *
  * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is
  *	the TDLS link initiator.
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 158c59ecf90a..903fc419217a 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1142,6 +1142,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
 			dev->priv_flags |= IFF_DONT_BRIDGE;
 
+		INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
+
 		nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
 		break;
 	case NETDEV_GOING_DOWN:
@@ -1230,6 +1232,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 #ifdef CONFIG_CFG80211_WEXT
 			kzfree(wdev->wext.keys);
 #endif
+			flush_work(&wdev->disconnect_wk);
 		}
 		/*
 		 * synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index bc8ba6e57519..ba42055a036d 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -400,6 +400,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 		       const u8 *resp_ie, size_t resp_ie_len);
 int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev);
+void cfg80211_autodisconnect_wk(struct work_struct *work);
 
 /* SME implementation */
 void cfg80211_conn_work(struct work_struct *work);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 4646cf5695b9..1c63a77aea34 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -345,6 +345,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 	     !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
 		return 0;
 
+	if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
+	    (wdev->current_bss &&
+	     ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+		wdev->conn_owner_nlportid = 0;
+
 	return rdev_deauth(rdev, dev, &req);
 }
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fed33ec20a71..b378d0a04003 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8050,8 +8050,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
 		wdev_lock(dev->ieee80211_ptr);
+
 		err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
 					  ssid, ssid_len, &req);
+
+		if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+			dev->ieee80211_ptr->conn_owner_nlportid =
+				info->snd_portid;
+			memcpy(dev->ieee80211_ptr->disconnect_bssid,
+			       bssid, ETH_ALEN);
+		}
+
 		wdev_unlock(dev->ieee80211_ptr);
 	}
 
@@ -8770,11 +8779,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	wdev_lock(dev->ieee80211_ptr);
+
 	err = cfg80211_connect(rdev, dev, &connect, connkeys,
 			       connect.prev_bssid);
-	wdev_unlock(dev->ieee80211_ptr);
 	if (err)
 		kzfree(connkeys);
+
+	if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
+		if (connect.bssid)
+			memcpy(dev->ieee80211_ptr->disconnect_bssid,
+			       connect.bssid, ETH_ALEN);
+		else
+			memset(dev->ieee80211_ptr->disconnect_bssid,
+			       0, ETH_ALEN);
+	}
+
+	wdev_unlock(dev->ieee80211_ptr);
+
 	return err;
 }
 
@@ -14491,6 +14513,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
 
 			if (wdev->owner_nlportid == notify->portid)
 				schedule_destroy_work = true;
+			else if (wdev->conn_owner_nlportid == notify->portid)
+				schedule_work(&wdev->disconnect_wk);
 		}
 
 		spin_lock_bh(&rdev->beacon_registrations_lock);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5e0d19380302..46693913fcea 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -727,6 +727,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		kzfree(wdev->connect_keys);
 		wdev->connect_keys = NULL;
 		wdev->ssid_len = 0;
+		wdev->conn_owner_nlportid = 0;
 		if (bss) {
 			cfg80211_unhold_bss(bss_from_pub(bss));
 			cfg80211_put_bss(wdev->wiphy, bss);
@@ -955,6 +956,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 
 	wdev->current_bss = NULL;
 	wdev->ssid_len = 0;
+	wdev->conn_owner_nlportid = 0;
 
 	nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
 
@@ -1098,6 +1100,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 	kzfree(wdev->connect_keys);
 	wdev->connect_keys = NULL;
 
+	wdev->conn_owner_nlportid = 0;
+
 	if (wdev->conn)
 		err = cfg80211_sme_disconnect(wdev, reason);
 	else if (!rdev->ops->disconnect)
@@ -1107,3 +1111,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 
 	return err;
 }
+
+/*
+ * Used to clean up after the connection / connection attempt owner socket
+ * disconnects
+ */
+void cfg80211_autodisconnect_wk(struct work_struct *work)
+{
+	struct wireless_dev *wdev =
+		container_of(work, struct wireless_dev, disconnect_wk);
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+	wdev_lock(wdev);
+
+	if (wdev->conn_owner_nlportid) {
+		/*
+		 * Use disconnect_bssid if still connecting and ops->disconnect
+		 * not implemented.  Otherwise we can use cfg80211_disconnect.
+		 */
+		if (rdev->ops->disconnect || wdev->current_bss)
+			cfg80211_disconnect(rdev, wdev->netdev,
+					    WLAN_REASON_DEAUTH_LEAVING, true);
+		else
+			cfg80211_mlme_deauth(rdev, wdev->netdev,
+					     wdev->disconnect_bssid, NULL, 0,
+					     WLAN_REASON_DEAUTH_LEAVING, false);
+	}
+
+	wdev_unlock(wdev);
+}
-- 
cgit v1.2.3


From f32815d21d4d8287336fb9cef4d2d9e0866214c2 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:40 -0500
Subject: xtables: add xt_match, xt_target and data copy_to_user functions

xt_entry_target, xt_entry_match and their private data may contain
kernel data.

Introduce helper functions xt_match_to_user, xt_target_to_user and
xt_data_to_user that copy only the expected fields. These replace
existing logic that calls copy_to_user on entire structs, then
overwrites select fields.

Private data is defined in xt_match and xt_target. All matches and
targets that maintain kernel data store this at the tail of their
private structure. Extend xt_match and xt_target with .usersize to
limit how many bytes of data are copied. The remainder is cleared.

If compatsize is specified, usersize can only safely be used if all
fields up to usersize use platform-independent types. Otherwise, the
compat_to_user callback must be defined.

This patch does not yet enable the support logic.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  9 +++++++
 net/netfilter/x_tables.c           | 54 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'net')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 5117e4d2ddfa..be378cf47fcc 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -167,6 +167,7 @@ struct xt_match {
 
 	const char *table;
 	unsigned int matchsize;
+	unsigned int usersize;
 #ifdef CONFIG_COMPAT
 	unsigned int compatsize;
 #endif
@@ -207,6 +208,7 @@ struct xt_target {
 
 	const char *table;
 	unsigned int targetsize;
+	unsigned int usersize;
 #ifdef CONFIG_COMPAT
 	unsigned int compatsize;
 #endif
@@ -287,6 +289,13 @@ int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
 		    bool inv_proto);
 
+int xt_match_to_user(const struct xt_entry_match *m,
+		     struct xt_entry_match __user *u);
+int xt_target_to_user(const struct xt_entry_target *t,
+		      struct xt_entry_target __user *u);
+int xt_data_to_user(void __user *dst, const void *src,
+		    int usersize, int size);
+
 void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 				 struct xt_counters_info *info, bool compat);
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2ff499680cc6..feccf527abdd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -262,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL_GPL(xt_request_find_target);
 
+
+static int xt_obj_to_user(u16 __user *psize, u16 size,
+			  void __user *pname, const char *name,
+			  u8 __user *prev, u8 rev)
+{
+	if (put_user(size, psize))
+		return -EFAULT;
+	if (copy_to_user(pname, name, strlen(name) + 1))
+		return -EFAULT;
+	if (put_user(rev, prev))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size,	\
+		       U->u.user.name, K->u.kernel.TYPE->name,		\
+		       &U->u.user.revision, K->u.kernel.TYPE->revision)
+
+int xt_data_to_user(void __user *dst, const void *src,
+		    int usersize, int size)
+{
+	usersize = usersize ? : size;
+	if (copy_to_user(dst, src, usersize))
+		return -EFAULT;
+	if (usersize != size && clear_user(dst + usersize, size - usersize))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_data_to_user);
+
+#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_data_to_user(U->data, K->data,				\
+			K->u.kernel.TYPE->usersize,			\
+			C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+
+int xt_match_to_user(const struct xt_entry_match *m,
+		     struct xt_entry_match __user *u)
+{
+	return XT_OBJ_TO_USER(u, m, match, 0) ||
+	       XT_DATA_TO_USER(u, m, match, 0);
+}
+EXPORT_SYMBOL_GPL(xt_match_to_user);
+
+int xt_target_to_user(const struct xt_entry_target *t,
+		      struct xt_entry_target __user *u)
+{
+	return XT_OBJ_TO_USER(u, t, target, 0) ||
+	       XT_DATA_TO_USER(u, t, target, 0);
+}
+EXPORT_SYMBOL_GPL(xt_target_to_user);
+
 static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
 {
 	const struct xt_match *m;
-- 
cgit v1.2.3


From f77bc5b23fb1af51fc0faa8a479dea8969eb5079 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:41 -0500
Subject: iptables: use match, target and data copy_to_user helpers

Convert iptables to copying entries, matches and targets one by one,
using the xt_match_to_user and xt_target_to_user helper functions.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 91656a1d8fbd..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -826,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -839,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct ipt_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -852,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
 		     i += m->u.match_size) {
 			m = (void *)e + i;
 
-			if (copy_to_user(userptr + off + i
-					 + offsetof(struct xt_entry_match,
-						    u.user.name),
-					 m->u.kernel.match->name,
-					 strlen(m->u.kernel.match->name)+1)
-			    != 0) {
+			if (xt_match_to_user(m, userptr + off + i)) {
 				ret = -EFAULT;
 				goto free_counters;
 			}
 		}
 
 		t = ipt_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
-- 
cgit v1.2.3


From e47ddb2c4691fd2bd8d25745ecb6848408899757 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:42 -0500
Subject: ip6tables: use match, target and data copy_to_user helpers

Convert ip6tables to copying entries, matches and targets one by one,
using the xt_match_to_user and xt_target_to_user helper functions.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6_tables.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 25a022d41a70..1e15c54fd5e2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -855,10 +855,6 @@ copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -868,6 +864,10 @@ copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct ip6t_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ip6t_entry, counters),
 				 &counters[num],
@@ -881,23 +881,14 @@ copy_entries_to_user(unsigned int total_size,
 		     i += m->u.match_size) {
 			m = (void *)e + i;
 
-			if (copy_to_user(userptr + off + i
-					 + offsetof(struct xt_entry_match,
-						    u.user.name),
-					 m->u.kernel.match->name,
-					 strlen(m->u.kernel.match->name)+1)
-			    != 0) {
+			if (xt_match_to_user(m, userptr + off + i)) {
 				ret = -EFAULT;
 				goto free_counters;
 			}
 		}
 
 		t = ip6t_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
-- 
cgit v1.2.3


From 244b531bee2bdcfcd35ca2fff9cc7d073b3aa060 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:43 -0500
Subject: arptables: use match, target and data copy_to_user helpers

Convert arptables to copying entries, matches and targets one by one,
using the xt_match_to_user and xt_target_to_user helper functions.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a467e1236c43..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -677,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	/* ... then copy entire thing ... */
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -689,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct arpt_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct arpt_entry, counters),
 				 &counters[num],
@@ -698,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
 		}
 
 		t = arpt_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
-- 
cgit v1.2.3


From b5040f6c33a51e6e1fddab75baf0f45d4943cde4 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:44 -0500
Subject: ebtables: use match, target and data copy_to_user helpers

Convert ebtables to copying entries, matches and targets one by one.

The solution is analogous to that of generic xt_(match|target)_to_user
helpers, but is applied to different structs.

Convert existing helpers ebt_make_XXXname helpers that overwrite
fields of an already copy_to_user'd struct with ebt_XXX_to_user
helpers that copy all relevant fields of the struct from scratch.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 78 +++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 537e3d506fc2..79b69917f521 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1346,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user,
 				hlp.num_counters, user, len);
 }
 
-static inline int ebt_make_matchname(const struct ebt_entry_match *m,
-				     const char *base, char __user *ubase)
+static inline int ebt_obj_to_user(char __user *um, const char *_name,
+				  const char *data, int entrysize,
+				  int usersize, int datasize)
 {
-	char __user *hlp = ubase + ((char *)m - base);
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+	char name[EBT_FUNCTION_MAXNAMELEN] = {0};
 
 	/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
 	 * long. Copy 29 bytes and fill remaining bytes with zeroes.
 	 */
-	strlcpy(name, m->u.match->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
+	strlcpy(name, _name, sizeof(name));
+	if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
+	    put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
+	    xt_data_to_user(um + entrysize, data, usersize, datasize))
 		return -EFAULT;
+
 	return 0;
 }
 
-static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
-				       const char *base, char __user *ubase)
+static inline int ebt_match_to_user(const struct ebt_entry_match *m,
+				    const char *base, char __user *ubase)
 {
-	char __user *hlp = ubase + ((char *)w - base);
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+	return ebt_obj_to_user(ubase + ((char *)m - base),
+			       m->u.match->name, m->data, sizeof(*m),
+			       m->u.match->usersize, m->match_size);
+}
 
-	strlcpy(name, w->u.watcher->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
-		return -EFAULT;
-	return 0;
+static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
+				      const char *base, char __user *ubase)
+{
+	return ebt_obj_to_user(ubase + ((char *)w - base),
+			       w->u.watcher->name, w->data, sizeof(*w),
+			       w->u.watcher->usersize, w->watcher_size);
 }
 
-static inline int ebt_make_names(struct ebt_entry *e, const char *base,
-				 char __user *ubase)
+static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
+				    char __user *ubase)
 {
 	int ret;
 	char __user *hlp;
 	const struct ebt_entry_target *t;
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
 
-	if (e->bitmask == 0)
+	if (e->bitmask == 0) {
+		/* special case !EBT_ENTRY_OR_ENTRIES */
+		if (copy_to_user(ubase + ((char *)e - base), e,
+				 sizeof(struct ebt_entries)))
+			return -EFAULT;
 		return 0;
+	}
+
+	if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
+		return -EFAULT;
 
 	hlp = ubase + (((char *)e + e->target_offset) - base);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 
-	ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+	ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
 	if (ret != 0)
 		return ret;
-	ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+	ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
 	if (ret != 0)
 		return ret;
-	strlcpy(name, t->u.target->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
-		return -EFAULT;
+	ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
+			      t->u.target->usersize, t->target_size);
+	if (ret != 0)
+		return ret;
+
 	return 0;
 }
 
@@ -1475,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
 	if (ret)
 		return ret;
 
-	if (copy_to_user(tmp.entries, entries, entries_size)) {
-		BUGPRINT("Couldn't copy entries to userspace\n");
-		return -EFAULT;
-	}
 	/* set the match/watcher/target names right */
 	return EBT_ENTRY_ITERATE(entries, entries_size,
-	   ebt_make_names, entries, tmp.entries);
+	   ebt_entry_to_user, entries, tmp.entries);
 }
 
 static int do_ebt_set_ctl(struct sock *sk,
@@ -1630,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
 	if (match->compat_to_user) {
 		if (match->compat_to_user(cm->data, m->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, m->data, msize))
+	} else {
+		if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
 			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
@@ -1657,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t,
 	if (target->compat_to_user) {
 		if (target->compat_to_user(cm->data, t->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, t->data, tsize))
-		return -EFAULT;
+	} else {
+		if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
-- 
cgit v1.2.3


From 4915f7bbc402071539ec0cb8c75aad878c982402 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:45 -0500
Subject: xtables: use match, target and data copy_to_user helpers in compat

Convert compat to copying entries, matches and targets one by one,
using the xt_match_to_user and xt_target_to_user helper functions.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index feccf527abdd..016db6be94b9 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -619,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 	int off = xt_compat_match_offset(match);
 	u_int16_t msize = m->u.user.match_size - off;
 
-	if (copy_to_user(cm, m, sizeof(*cm)) ||
-	    put_user(msize, &cm->u.user.match_size) ||
-	    copy_to_user(cm->u.user.name, m->u.kernel.match->name,
-			 strlen(m->u.kernel.match->name) + 1))
+	if (XT_OBJ_TO_USER(cm, m, match, msize))
 		return -EFAULT;
 
 	if (match->compat_to_user) {
 		if (match->compat_to_user((void __user *)cm->data, m->data))
 			return -EFAULT;
 	} else {
-		if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+		if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
 			return -EFAULT;
 	}
 
@@ -977,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
 	int off = xt_compat_target_offset(target);
 	u_int16_t tsize = t->u.user.target_size - off;
 
-	if (copy_to_user(ct, t, sizeof(*ct)) ||
-	    put_user(tsize, &ct->u.user.target_size) ||
-	    copy_to_user(ct->u.user.name, t->u.kernel.target->name,
-			 strlen(t->u.kernel.target->name) + 1))
+	if (XT_OBJ_TO_USER(ct, t, target, tsize))
 		return -EFAULT;
 
 	if (target->compat_to_user) {
 		if (target->compat_to_user((void __user *)ct->data, t->data))
 			return -EFAULT;
 	} else {
-		if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+		if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
 			return -EFAULT;
 	}
 
-- 
cgit v1.2.3


From ec23189049651b16dc2ffab35a4371dc1f491aca Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 2 Jan 2017 17:19:46 -0500
Subject: xtables: extend matches and targets with .usersize

In matches and targets that define a kernel-only tail to their
xt_match and xt_target data structs, add a field .usersize that
specifies up to where data is to be shared with userspace.

Performed a search for comment "Used internally by the kernel" to find
relevant matches and targets. Manually inspected the structs to derive
a valid offsetof.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_limit.c   | 1 +
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 +
 net/ipv6/netfilter/ip6t_NPT.c      | 2 ++
 net/netfilter/xt_CT.c              | 3 +++
 net/netfilter/xt_RATEEST.c         | 1 +
 net/netfilter/xt_TEE.c             | 2 ++
 net/netfilter/xt_bpf.c             | 2 ++
 net/netfilter/xt_cgroup.c          | 1 +
 net/netfilter/xt_connlimit.c       | 1 +
 net/netfilter/xt_hashlimit.c       | 4 ++++
 net/netfilter/xt_limit.c           | 2 ++
 net/netfilter/xt_quota.c           | 1 +
 net/netfilter/xt_rateest.c         | 1 +
 net/netfilter/xt_string.c          | 1 +
 14 files changed, 23 insertions(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 517e78befcb2..61a9f1be1263 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
 	.match		= ebt_limit_mt,
 	.checkentry	= ebt_limit_mt_check,
 	.matchsize	= sizeof(struct ebt_limit_info),
+	.usersize	= offsetof(struct ebt_limit_info, prev),
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(struct ebt_compat_limit_info),
 #endif
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 21db00d0362b..8a3d20ebb815 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -468,6 +468,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
 	.checkentry	= clusterip_tg_check,
 	.destroy	= clusterip_tg_destroy,
 	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
+	.usersize	= offsetof(struct ipt_clusterip_tgt_info, config),
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(struct compat_ipt_clusterip_tgt_info),
 #endif /* CONFIG_COMPAT */
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 590f767db5d4..a379d2f79b19 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
 		.table		= "mangle",
 		.target		= ip6t_snpt_tg,
 		.targetsize	= sizeof(struct ip6t_npt_tginfo),
+		.usersize	= offsetof(struct ip6t_npt_tginfo, adjustment),
 		.checkentry	= ip6t_npt_checkentry,
 		.family		= NFPROTO_IPV6,
 		.hooks		= (1 << NF_INET_LOCAL_IN) |
@@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
 		.table		= "mangle",
 		.target		= ip6t_dnpt_tg,
 		.targetsize	= sizeof(struct ip6t_npt_tginfo),
+		.usersize	= offsetof(struct ip6t_npt_tginfo, adjustment),
 		.checkentry	= ip6t_npt_checkentry,
 		.family		= NFPROTO_IPV6,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 95c750358747..26b0bccfa0c5 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -373,6 +373,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.name		= "CT",
 		.family		= NFPROTO_UNSPEC,
 		.targetsize	= sizeof(struct xt_ct_target_info),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v0,
 		.destroy	= xt_ct_tg_destroy_v0,
 		.target		= xt_ct_target_v0,
@@ -384,6 +385,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.family		= NFPROTO_UNSPEC,
 		.revision	= 1,
 		.targetsize	= sizeof(struct xt_ct_target_info_v1),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v1,
 		.destroy	= xt_ct_tg_destroy_v1,
 		.target		= xt_ct_target_v1,
@@ -395,6 +397,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.family		= NFPROTO_UNSPEC,
 		.revision	= 2,
 		.targetsize	= sizeof(struct xt_ct_target_info_v1),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v2,
 		.destroy	= xt_ct_tg_destroy_v1,
 		.target		= xt_ct_target_v1,
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 91a373a3f534..498b54fd04d7 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
 	.checkentry = xt_rateest_tg_checkentry,
 	.destroy    = xt_rateest_tg_destroy,
 	.targetsize = sizeof(struct xt_rateest_target_info),
+	.usersize   = offsetof(struct xt_rateest_target_info, est),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 1c57ace75ae6..86b0580b2216 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.family     = NFPROTO_IPV4,
 		.target     = tee_tg4,
 		.targetsize = sizeof(struct xt_tee_tginfo),
+		.usersize   = offsetof(struct xt_tee_tginfo, priv),
 		.checkentry = tee_tg_check,
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
@@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.family     = NFPROTO_IPV6,
 		.target     = tee_tg6,
 		.targetsize = sizeof(struct xt_tee_tginfo),
+		.usersize   = offsetof(struct xt_tee_tginfo, priv),
 		.checkentry = tee_tg_check,
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 2dedaa23ab0a..38986a95216c 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -110,6 +110,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
 		.match		= bpf_mt,
 		.destroy	= bpf_mt_destroy,
 		.matchsize	= sizeof(struct xt_bpf_info),
+		.usersize	= offsetof(struct xt_bpf_info, filter),
 		.me		= THIS_MODULE,
 	},
 	{
@@ -120,6 +121,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
 		.match		= bpf_mt_v1,
 		.destroy	= bpf_mt_destroy_v1,
 		.matchsize	= sizeof(struct xt_bpf_info_v1),
+		.usersize	= offsetof(struct xt_bpf_info_v1, filter),
 		.me		= THIS_MODULE,
 	},
 };
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a086a914865f..1db1ce59079f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
 		.checkentry	= cgroup_mt_check_v1,
 		.match		= cgroup_mt_v1,
 		.matchsize	= sizeof(struct xt_cgroup_info_v1),
+		.usersize	= offsetof(struct xt_cgroup_info_v1, priv),
 		.destroy	= cgroup_mt_destroy_v1,
 		.me		= THIS_MODULE,
 		.hooks		= (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 660b61dbd776..b8fd4ab762ed 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 	.checkentry = connlimit_mt_check,
 	.match      = connlimit_mt,
 	.matchsize  = sizeof(struct xt_connlimit_info),
+	.usersize   = offsetof(struct xt_connlimit_info, data),
 	.destroy    = connlimit_mt_destroy,
 	.me         = THIS_MODULE,
 };
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10063408141d..26ef70c50e3b 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -838,6 +838,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV4,
 		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo1, hinfo),
 		.checkentry     = hashlimit_mt_check_v1,
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
@@ -848,6 +849,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV4,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
@@ -859,6 +861,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV6,
 		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo1, hinfo),
 		.checkentry     = hashlimit_mt_check_v1,
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
@@ -869,6 +872,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV6,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bef850596558..dab962df1787 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = {
 	.compatsize       = sizeof(struct compat_xt_rateinfo),
 	.compat_from_user = limit_mt_compat_from_user,
 	.compat_to_user   = limit_mt_compat_to_user,
+#else
+	.usersize         = offsetof(struct xt_rateinfo, prev),
 #endif
 	.me               = THIS_MODULE,
 };
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 44c8eb4c9d66..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = {
 	.checkentry = quota_mt_check,
 	.destroy    = quota_mt_destroy,
 	.matchsize  = sizeof(struct xt_quota_info),
+	.usersize   = offsetof(struct xt_quota_info, master),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 1db02f6fca54..755d2f6693a2 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -133,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = {
 	.checkentry = xt_rateest_mt_checkentry,
 	.destroy    = xt_rateest_mt_destroy,
 	.matchsize  = sizeof(struct xt_rateest_match_info),
+	.usersize   = offsetof(struct xt_rateest_match_info, est1),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 0bc3460319c8..423293ee57c2 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = {
 	.match      = string_mt,
 	.destroy    = string_mt_destroy,
 	.matchsize  = sizeof(struct xt_string_info),
+	.usersize   = offsetof(struct xt_string_info, config),
 	.me         = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 1e9116327e1dbbf33fcb539c977ff39e1c680e6c Mon Sep 17 00:00:00 2001
From: yuan linyu
Date: Sat, 7 Jan 2017 17:18:31 +0800
Subject: net: change init_inodecache() return void

sock_init() call it but not check it's return value,
so change it to void return and add an internal BUG_ON() check.

Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index c65bb927b104..3ef02e97ecf3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -287,7 +287,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static void init_inodecache(void)
 {
 	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
 					      sizeof(struct socket_alloc),
@@ -296,9 +296,7 @@ static int init_inodecache(void)
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
-	if (sock_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
+	BUG_ON(sock_inode_cachep == NULL);
 }
 
 static const struct super_operations sockfs_ops = {
-- 
cgit v1.2.3


From eafea7390e597f766927d1ba7459f3904b0b9194 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Sat, 7 Jan 2017 20:04:23 -0800
Subject: net: ipv4: remove disable of bottom half in inet_rtm_getroute

Nothing about the route lookup requires bottom half to be disabled.
Remove the local_bh_disable ... local_bh_enable around ip_route_input.
This appears to be a vestige of days gone by as it has been there
since the beginning of git time.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f51823dc998b..7144288371cf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2631,9 +2631,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
 		skb->mark	= mark;
-		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
-		local_bh_enable();
 
 		rt = skb_rtable(skb);
 		if (err == 0 && rt->dst.error)
-- 
cgit v1.2.3


From 7cd23e5300c1b95903859a8bdc084e79be66ce16 Mon Sep 17 00:00:00 2001
From: Jason A. Donenfeld
Date: Sun, 8 Jan 2017 13:54:02 +0100
Subject: secure_seq: use SipHash in place of MD5

This gives a clear speed and security improvement. Siphash is both
faster and is more solid crypto than the aging MD5.

Rather than manually filling MD5 buffers, for IPv6, we simply create
a layout by a simple anonymous struct, for which gcc generates
rather efficient code. For IPv4, we pass the values directly to the
short input convenience functions.

64-bit x86_64:
[    1.683628] secure_tcpv6_sequence_number_md5# cycles: 99563527
[    1.717350] secure_tcp_sequence_number_md5# cycles: 92890502
[    1.741968] secure_tcpv6_sequence_number_siphash# cycles: 67825362
[    1.762048] secure_tcp_sequence_number_siphash# cycles: 67485526

32-bit x86:
[    1.600012] secure_tcpv6_sequence_number_md5# cycles: 103227892
[    1.634219] secure_tcp_sequence_number_md5# cycles: 94732544
[    1.669102] secure_tcpv6_sequence_number_siphash# cycles: 96299384
[    1.700165] secure_tcp_sequence_number_siphash# cycles: 86015473

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Miller <davem@davemloft.net>
Cc: David Laight <David.Laight@aculab.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/secure_seq.c | 145 ++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 82 deletions(-)

(limited to 'net')

diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 88a8e429fc3e..3a9fcec94ace 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cryptohash.h>
@@ -8,18 +12,18 @@
 #include <linux/ktime.h>
 #include <linux/string.h>
 #include <linux/net.h>
-
+#include <linux/siphash.h>
 #include <net/secure_seq.h>
 
 #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <linux/in6.h>
 #include <net/tcp.h>
-#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
 
-static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+static siphash_key_t net_secret __read_mostly;
 
 static __always_inline void net_secret_init(void)
 {
-	net_get_random_once(net_secret, sizeof(net_secret));
+	net_get_random_once(&net_secret, sizeof(net_secret));
 }
 #endif
 
@@ -44,80 +48,70 @@ static u32 seq_scale(u32 seq)
 u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				 __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
+	u64 hash;
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash(&combined, offsetofend(typeof(combined), dport),
+		       &net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
 
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.dport = dport
+	};
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32) daddr[i];
-	secret[4] = net_secret[4] + (__force u32)dport;
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	return hash[0];
+	return siphash(&combined, offsetofend(typeof(combined), dport),
+		       &net_secret);
 }
 EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
 
+/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
+
 u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 			       __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
+	u64 hash;
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u32)sport << 16 | (__force u32)dport,
+			    &net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = (__force u32)dport ^ net_secret[14];
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	return hash[0];
+	return siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u16)dport, &net_secret);
 }
 EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 #endif
@@ -126,21 +120,11 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
 	u64 seq;
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash_3u32(saddr, daddr, (u32)sport << 16 | dport, &net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -149,26 +133,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
 	u64 seq;
-	u32 i;
-
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash(&combined, offsetofend(typeof(combined), dport),
+		      &net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-- 
cgit v1.2.3


From fe62d05b295bde037fa324767674540907c89362 Mon Sep 17 00:00:00 2001
From: Jason A. Donenfeld
Date: Sun, 8 Jan 2017 13:54:03 +0100
Subject: syncookies: use SipHash in place of SHA1

SHA1 is slower and less secure than SipHash, and so replacing syncookie
generation with SipHash makes natural sense. Some BSDs have been doing
this for several years in fact.

The speedup should be similar -- and even more impressive -- to the
speedup from the sequence number fix in this series.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 21 +++++----------------
 net/ipv6/syncookies.c | 41 +++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3e88467d70ee..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
 #include <linux/tcp.h>
 #include <linux/slab.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <net/tcp.h>
 #include <net/route.h>
 
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
 
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 #define TSBITS	6
 #define TSMASK	(((__u32)1 << TSBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
 {
-	__u32 *tmp;
-
 	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
-	tmp  = this_cpu_ptr(ipv4_cookie_scratch);
-	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
-	tmp[0] = (__force u32)saddr;
-	tmp[1] = (__force u32)daddr;
-	tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[3] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u32)sport << 16 | (__force u32)dport,
+			    count, &syncookie_secret[c]);
 }
 
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index a4d49760bf43..895ff650db43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
 
 #include <linux/tcp.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <net/ipv6.h>
 #include <net/tcp.h>
@@ -24,7 +24,7 @@
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie6_secret[2] __read_mostly;
 
 /* RFC 2460, Section 8.3:
  * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,27 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
-
-static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
+static u32 cookie_hash(const struct in6_addr *saddr,
+		       const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
 {
-	__u32 *tmp;
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		u32 count;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *saddr,
+		.daddr = *daddr,
+		.count = count,
+		.sport = sport,
+		.dport = dport
+	};
 
 	net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
-
-	tmp  = this_cpu_ptr(ipv6_cookie_scratch);
-
-	/*
-	 * we have 320 bits of information to hash, copy in the remaining
-	 * 192 bits required for sha_transform, from the syncookie6_secret
-	 * and overwrite the digest with the secret
-	 */
-	memcpy(tmp + 10, syncookie6_secret[c], 44);
-	memcpy(tmp, saddr, 16);
-	memcpy(tmp + 4, daddr, 16);
-	tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[9] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash(&combined, offsetofend(typeof(combined), dport),
+		       &syncookie6_secret[c]);
 }
 
 static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
-- 
cgit v1.2.3


From 58fa118f3de45481df2ac2b8b41e8114cae2574d Mon Sep 17 00:00:00 2001
From: Alexandru Moise
Date: Sun, 8 Jan 2017 18:49:46 +0200
Subject: cls_u32: don't bother explicitly initializing ->divisor to zero

This struct member is already initialized to zero upon root_ht's
allocation via kzalloc().

Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3aec308..a6ec3e4b57ab 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp)
 	if (root_ht == NULL)
 		return -ENOBUFS;
 
-	root_ht->divisor = 0;
 	root_ht->refcnt++;
 	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
 	root_ht->prio = tp->prio;
-- 
cgit v1.2.3


From ab9d226e3e2a8aeca426701b735194bc35179c2d Mon Sep 17 00:00:00 2001
From: Davide Caratti
Date: Mon, 9 Jan 2017 11:24:20 +0100
Subject: net/sched: Kconfig: select LIBCRC32C if NET_ACT_CSUM is selected

LIBCRC32C is needed to compute crc32c on SCTP packets.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a768d1b..a9aa38d43fa7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -707,6 +707,7 @@ config NET_ACT_SKBEDIT
 config NET_ACT_CSUM
         tristate "Checksum Updating"
         depends on NET_CLS_ACT && INET
+        select LIBCRC32C
         ---help---
 	  Say Y here to update some common checksum after some direct
 	  packet alterations.
-- 
cgit v1.2.3


From c008b33f3ef0915dfb57432dba1fa0ce34fdcc29 Mon Sep 17 00:00:00 2001
From: Davide Caratti
Date: Mon, 9 Jan 2017 11:24:21 +0100
Subject: net/sched: act_csum: compute crc32c on SCTP packets

modify act_csum to compute crc32c on IPv4/IPv6 packets having SCTP in
their payload, and extend UAPI definitions accordingly.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_csum.h |  3 ++-
 net/sched/act_csum.c                | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/tc_act/tc_csum.h b/include/uapi/linux/tc_act/tc_csum.h
index 8ac8041ab5f1..a11bb355dbfb 100644
--- a/include/uapi/linux/tc_act/tc_csum.h
+++ b/include/uapi/linux/tc_act/tc_csum.h
@@ -21,7 +21,8 @@ enum {
 	TCA_CSUM_UPDATE_FLAG_IGMP    = 4,
 	TCA_CSUM_UPDATE_FLAG_TCP     = 8,
 	TCA_CSUM_UPDATE_FLAG_UDP     = 16,
-	TCA_CSUM_UPDATE_FLAG_UDPLITE = 32
+	TCA_CSUM_UPDATE_FLAG_UDPLITE = 32,
+	TCA_CSUM_UPDATE_FLAG_SCTP    = 64,
 };
 
 struct tc_csum {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index a0edd80a44db..e978ccd4402c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -30,6 +30,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/ip6_checksum.h>
+#include <net/sctp/checksum.h>
 
 #include <net/act_api.h>
 
@@ -322,6 +323,25 @@ ignore_obscure_skb:
 	return 1;
 }
 
+static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
+			 unsigned int ipl)
+{
+	struct sctphdr *sctph;
+
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)
+		return 1;
+
+	sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
+	if (!sctph)
+		return 0;
+
+	sctph->checksum = sctp_compute_cksum(skb,
+					     skb_network_offset(skb) + ihl);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
 static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 {
 	const struct iphdr *iph;
@@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 					       ntohs(iph->tot_len), 1))
 				goto fail;
 		break;
+	case IPPROTO_SCTP:
+		if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+		    !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
+			goto fail;
+		break;
 	}
 
 	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
@@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
 						       pl + sizeof(*ip6h), 1))
 					goto fail;
 			goto done;
+		case IPPROTO_SCTP:
+			if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+			    !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
+				goto fail;
+			goto done;
 		default:
 			goto ignore_skb;
 		}
-- 
cgit v1.2.3


From ab3d408d3f40f939d46a32b1c24aa2833a13b846 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Sun, 8 Jan 2017 14:52:07 -0800
Subject: net: dsa: Encapsulate legacy switch drivers into dsa_switch_driver

In preparation for making struct dsa_switch_ops const, encapsulate it
within a dsa_switch_driver which has a list pointer and a pointer to
dsa_switch_ops. This allows us to take the list_head pointer out of
dsa_switch_ops, which is written to by {un,}register_switch_driver.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6060.c      |  8 ++++++--
 drivers/net/dsa/mv88e6xxx/chip.c |  8 ++++++--
 include/net/dsa.h                | 11 +++++++----
 net/dsa/dsa.c                    | 12 +++++++-----
 4 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index 7ce36dbd9b62..bcbd6dcbd8e8 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -261,16 +261,20 @@ static struct dsa_switch_ops mv88e6060_switch_ops = {
 	.phy_write	= mv88e6060_phy_write,
 };
 
+static struct dsa_switch_driver mv88e6060_switch_drv = {
+	.ops		= &mv88e6060_switch_ops,
+};
+
 static int __init mv88e6060_init(void)
 {
-	register_switch_driver(&mv88e6060_switch_ops);
+	register_switch_driver(&mv88e6060_switch_drv);
 	return 0;
 }
 module_init(mv88e6060_init);
 
 static void __exit mv88e6060_cleanup(void)
 {
-	unregister_switch_driver(&mv88e6060_switch_ops);
+	unregister_switch_driver(&mv88e6060_switch_drv);
 }
 module_exit(mv88e6060_cleanup);
 
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 676b0e2ad221..d43d12c281b3 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4403,6 +4403,10 @@ static struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.port_mdb_dump          = mv88e6xxx_port_mdb_dump,
 };
 
+static struct dsa_switch_driver mv88e6xxx_switch_drv = {
+	.ops			= &mv88e6xxx_switch_ops,
+};
+
 static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip,
 				     struct device_node *np)
 {
@@ -4565,7 +4569,7 @@ static struct mdio_driver mv88e6xxx_driver = {
 
 static int __init mv88e6xxx_init(void)
 {
-	register_switch_driver(&mv88e6xxx_switch_ops);
+	register_switch_driver(&mv88e6xxx_switch_drv);
 	return mdio_driver_register(&mv88e6xxx_driver);
 }
 module_init(mv88e6xxx_init);
@@ -4573,7 +4577,7 @@ module_init(mv88e6xxx_init);
 static void __exit mv88e6xxx_cleanup(void)
 {
 	mdio_driver_unregister(&mv88e6xxx_driver);
-	unregister_switch_driver(&mv88e6xxx_switch_ops);
+	unregister_switch_driver(&mv88e6xxx_switch_drv);
 }
 module_exit(mv88e6xxx_cleanup);
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index b122196d5a1f..edfa9b130953 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -240,8 +240,6 @@ struct switchdev_obj_port_mdb;
 struct switchdev_obj_port_vlan;
 
 struct dsa_switch_ops {
-	struct list_head	list;
-
 	/*
 	 * Probing and setup.
 	 */
@@ -390,8 +388,13 @@ struct dsa_switch_ops {
 				 int (*cb)(struct switchdev_obj *obj));
 };
 
-void register_switch_driver(struct dsa_switch_ops *type);
-void unregister_switch_driver(struct dsa_switch_ops *type);
+struct dsa_switch_driver {
+	struct list_head	list;
+	struct dsa_switch_ops	*ops;
+};
+
+void register_switch_driver(struct dsa_switch_driver *type);
+void unregister_switch_driver(struct dsa_switch_driver *type);
 struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
 
 static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index cda787ebad15..4e7bc57cdae5 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -60,18 +60,18 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
 static DEFINE_MUTEX(dsa_switch_drivers_mutex);
 static LIST_HEAD(dsa_switch_drivers);
 
-void register_switch_driver(struct dsa_switch_ops *ops)
+void register_switch_driver(struct dsa_switch_driver *drv)
 {
 	mutex_lock(&dsa_switch_drivers_mutex);
-	list_add_tail(&ops->list, &dsa_switch_drivers);
+	list_add_tail(&drv->list, &dsa_switch_drivers);
 	mutex_unlock(&dsa_switch_drivers_mutex);
 }
 EXPORT_SYMBOL_GPL(register_switch_driver);
 
-void unregister_switch_driver(struct dsa_switch_ops *ops)
+void unregister_switch_driver(struct dsa_switch_driver *drv)
 {
 	mutex_lock(&dsa_switch_drivers_mutex);
-	list_del_init(&ops->list);
+	list_del_init(&drv->list);
 	mutex_unlock(&dsa_switch_drivers_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_switch_driver);
@@ -90,8 +90,10 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 	mutex_lock(&dsa_switch_drivers_mutex);
 	list_for_each(list, &dsa_switch_drivers) {
 		struct dsa_switch_ops *ops;
+		struct dsa_switch_driver *drv;
 
-		ops = list_entry(list, struct dsa_switch_ops, list);
+		drv = list_entry(list, struct dsa_switch_driver, list);
+		ops = drv->ops;
 
 		name = ops->probe(parent, host_dev, sw_addr, priv);
 		if (name != NULL) {
-- 
cgit v1.2.3


From a82f67afe8e297834bedafa529941d9d0808caf8 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Sun, 8 Jan 2017 14:52:08 -0800
Subject: net: dsa: Make dsa_switch_ops const

Now that we have properly encapsulated and made drivers utilize exported
functions, we can switch dsa_switch_ops to be a annotated with const.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c |  2 +-
 drivers/net/dsa/bcm_sf2.c        |  2 +-
 drivers/net/dsa/mv88e6060.c      |  2 +-
 drivers/net/dsa/mv88e6xxx/chip.c |  2 +-
 drivers/net/dsa/qca8k.c          |  2 +-
 include/net/dsa.h                |  4 ++--
 net/dsa/dsa.c                    | 10 +++++-----
 net/dsa/hwmon.c                  |  2 +-
 8 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index a448661b55c6..5102a3701a1a 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1453,7 +1453,7 @@ static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds)
 	return DSA_TAG_PROTO_NONE;
 }
 
-static struct dsa_switch_ops b53_switch_ops = {
+static const struct dsa_switch_ops b53_switch_ops = {
 	.get_tag_protocol	= b53_get_tag_protocol,
 	.setup			= b53_setup,
 	.get_strings		= b53_get_strings,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 52027718d06f..31d017086f8b 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -977,7 +977,7 @@ static struct b53_io_ops bcm_sf2_io_ops = {
 	.write64 = bcm_sf2_core_write64,
 };
 
-static struct dsa_switch_ops bcm_sf2_ops = {
+static const struct dsa_switch_ops bcm_sf2_ops = {
 	.get_tag_protocol	= bcm_sf2_sw_get_tag_protocol,
 	.setup			= bcm_sf2_sw_setup,
 	.get_strings		= b53_get_strings,
diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index bcbd6dcbd8e8..5934b7a4c448 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -252,7 +252,7 @@ mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
 	return reg_write(ds, addr, regnum, val);
 }
 
-static struct dsa_switch_ops mv88e6060_switch_ops = {
+static const struct dsa_switch_ops mv88e6060_switch_ops = {
 	.get_tag_protocol = mv88e6060_get_tag_protocol,
 	.probe		= mv88e6060_drv_probe,
 	.setup		= mv88e6060_setup,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d43d12c281b3..eea8e0176e33 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4361,7 +4361,7 @@ static int mv88e6xxx_port_mdb_dump(struct dsa_switch *ds, int port,
 	return err;
 }
 
-static struct dsa_switch_ops mv88e6xxx_switch_ops = {
+static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.probe			= mv88e6xxx_drv_probe,
 	.get_tag_protocol	= mv88e6xxx_get_tag_protocol,
 	.setup			= mv88e6xxx_setup,
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index b3df70d07ff6..54d270d59eb0 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -911,7 +911,7 @@ qca8k_get_tag_protocol(struct dsa_switch *ds)
 	return DSA_TAG_PROTO_QCA;
 }
 
-static struct dsa_switch_ops qca8k_switch_ops = {
+static const struct dsa_switch_ops qca8k_switch_ops = {
 	.get_tag_protocol	= qca8k_get_tag_protocol,
 	.setup			= qca8k_setup,
 	.get_strings		= qca8k_get_strings,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index edfa9b130953..b94d1f2ef912 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -169,7 +169,7 @@ struct dsa_switch {
 	/*
 	 * The switch operations.
 	 */
-	struct dsa_switch_ops	*ops;
+	const struct dsa_switch_ops	*ops;
 
 	/*
 	 * An array of which element [a] indicates which port on this
@@ -390,7 +390,7 @@ struct dsa_switch_ops {
 
 struct dsa_switch_driver {
 	struct list_head	list;
-	struct dsa_switch_ops	*ops;
+	const struct dsa_switch_ops *ops;
 };
 
 void register_switch_driver(struct dsa_switch_driver *type);
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 4e7bc57cdae5..fd532487dfdf 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -76,11 +76,11 @@ void unregister_switch_driver(struct dsa_switch_driver *drv)
 }
 EXPORT_SYMBOL_GPL(unregister_switch_driver);
 
-static struct dsa_switch_ops *
+static const struct dsa_switch_ops *
 dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 		 const char **_name, void **priv)
 {
-	struct dsa_switch_ops *ret;
+	const struct dsa_switch_ops *ret;
 	struct list_head *list;
 	const char *name;
 
@@ -89,7 +89,7 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 
 	mutex_lock(&dsa_switch_drivers_mutex);
 	list_for_each(list, &dsa_switch_drivers) {
-		struct dsa_switch_ops *ops;
+		const struct dsa_switch_ops *ops;
 		struct dsa_switch_driver *drv;
 
 		drv = list_entry(list, struct dsa_switch_driver, list);
@@ -207,7 +207,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
 
 static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 {
-	struct dsa_switch_ops *ops = ds->ops;
+	const struct dsa_switch_ops *ops = ds->ops;
 	struct dsa_switch_tree *dst = ds->dst;
 	struct dsa_chip_data *cd = ds->cd;
 	bool valid_name_found = false;
@@ -326,7 +326,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 		 struct device *parent, struct device *host_dev)
 {
 	struct dsa_chip_data *cd = dst->pd->chip + index;
-	struct dsa_switch_ops *ops;
+	const struct dsa_switch_ops *ops;
 	struct dsa_switch *ds;
 	int ret;
 	const char *name;
diff --git a/net/dsa/hwmon.c b/net/dsa/hwmon.c
index 3a9cdf0b22b8..08831a811278 100644
--- a/net/dsa/hwmon.c
+++ b/net/dsa/hwmon.c
@@ -86,7 +86,7 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct dsa_switch *ds = dev_get_drvdata(dev);
-	struct dsa_switch_ops *ops = ds->ops;
+	const struct dsa_switch_ops *ops = ds->ops;
 	umode_t mode = attr->mode;
 
 	if (index == 1) {
-- 
cgit v1.2.3


From 8d9ba388f35b3c681975a6b3f6ba60bb42c98f8d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Mon, 9 Jan 2017 16:04:04 +0100
Subject: Revert "icmp: avoid allocating large struct on stack"

This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
on stack"), because struct icmp_bxm no really a large struct, and
allocating and free of this small 112 bytes hurts performance.

Fixes: 9a99d4a50cb8 ("icmp: avoid allocating large struct on stack")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..b4b9807329a7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct iphdr *iph;
 	int room;
-	struct icmp_bxm *icmp_param;
+	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
@@ -648,13 +648,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
-
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		return;
 
 	/*
 	 *	Construct source address and options.
@@ -681,7 +677,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					  iph->tos;
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
-	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
@@ -689,22 +685,22 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	 *	Prepare data for ICMP header.
 	 */
 
-	icmp_param->data.icmph.type	 = type;
-	icmp_param->data.icmph.code	 = code;
-	icmp_param->data.icmph.un.gateway = info;
-	icmp_param->data.icmph.checksum	 = 0;
-	icmp_param->skb	  = skb_in;
-	icmp_param->offset = skb_network_offset(skb_in);
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param->replyopts.opt;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
-			       type, code, icmp_param);
+			       type, code, &icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
 
@@ -716,21 +712,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
-	icmp_param->data_len = skb_in->len - icmp_param->offset;
-	if (icmp_param->data_len > room)
-		icmp_param->data_len = room;
-	icmp_param->head_len = sizeof(struct icmphdr);
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
-out_free:
-	kfree(icmp_param);
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
-- 
cgit v1.2.3


From c0303efeab7391ec51c337e0ac5740860ad01fe7 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Mon, 9 Jan 2017 16:04:09 +0100
Subject: net: reduce cycles spend on ICMP replies that gets rate limited

This patch split the global and per (inet)peer ICMP-reply limiter
code, and moves the global limit check to earlier in the packet
processing path.  Thus, avoid spending cycles on ICMP replies that
gets limited/suppressed anyhow.

The global ICMP rate limiter icmp_global_allow() is a good solution,
it just happens too late in the process.  The kernel goes through the
full route lookup (return path) for the ICMP message, before taking
the rate limit decision of not sending the ICMP reply.

Details: The kernels global rate limiter for ICMP messages got added
in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
a token bucket limiter with a global lock.  It brilliantly avoids
locking congestion by only updating when 20ms (HZ/50) were elapsed. It
can then avoids taking lock when credit is exhausted (when under
pressure) and time constraint for refill is not yet meet.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 71 ++++++++++++++++++++++++++++++++++++++-------------------
 net/ipv6/icmp.c | 49 +++++++++++++++++++++++++++------------
 2 files changed, 82 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b4b9807329a7..58d75ca58b83 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -282,6 +282,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +317,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,6 +411,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -405,6 +422,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		return;
 	inet = inet_sk(sk);
 
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
@@ -433,8 +454,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
@@ -650,7 +670,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out;
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
 
 	/*
 	 *	Construct source address and options.
@@ -704,6 +728,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	if (IS_ERR(rt))
 		goto out_unlock;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..b26ae8b5c1ce 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -168,6 +168,30 @@ static bool is_ineligible(const struct sk_buff *skb)
 	return false;
 }
 
+static bool icmpv6_mask_allow(int type)
+{
+	/* Informational messages are not limited. */
+	if (type & ICMPV6_INFOMSG_MASK)
+		return true;
+
+	/* Do not limit pmtu discovery, it would break it. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+	if (icmpv6_mask_allow(type))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  * Check the ICMP output rate limit
  */
@@ -178,12 +202,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	struct dst_entry *dst;
 	bool res = false;
 
-	/* Informational messages are not limited. */
-	if (type & ICMPV6_INFOMSG_MASK)
-		return true;
-
-	/* Do not limit pmtu discovery, it would break it. */
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (icmpv6_mask_allow(type))
 		return true;
 
 	/*
@@ -200,20 +219,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (icmp_global_allow()) {
-			struct inet_peer *peer;
-
-			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &fl6->daddr, 1);
-			res = inet_peer_xrlim_allow(peer, tmo);
-			if (peer)
-				inet_putpeer(peer);
-		}
+		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
@@ -493,6 +508,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
 		return;
+
+	if (!icmpv6_global_allow(type))
+		goto out;
+
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
-- 
cgit v1.2.3


From 7ba91ecb16824f74ba4fcbc4e88cd4d24a839b25 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Mon, 9 Jan 2017 16:04:14 +0100
Subject: net: for rate-limited ICMP replies save one atomic operation

It is possible to avoid the atomic operation in icmp{v6,}_xmit_lock,
by checking the sysctl_icmp_msgs_per_sec ratelimit before these calls,
as pointed out by Eric Dumazet, but the BH disabled state must be correct.

The icmp_global_allow() call states it must be called with BH
disabled.  This protection was given by the calls icmp_xmit_lock and
icmpv6_xmit_lock.  Thus, split out local_bh_disable/enable from these
functions and maintain it explicitly at callers.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 34 +++++++++++++++++++++-------------
 net/ipv6/icmp.c | 25 ++++++++++++++++---------
 2 files changed, 37 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 58d75ca58b83..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
+/* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -417,14 +415,17 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
-	sk = icmp_xmit_lock(net);
-	if (!sk)
-		return;
-	inet = inet_sk(sk);
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
 
 	/* global icmp_msgs_per_sec */
 	if (!icmpv4_global_allow(net, type, code))
-		goto out_unlock;
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
+	inet = inet_sk(sk);
 
 	icmp_param->data.icmph.checksum = 0;
 
@@ -459,6 +460,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -668,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	sk = icmp_xmit_lock(net);
-	if (!sk)
-		goto out;
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
 
 	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
 	if (!icmpv4_global_allow(net, type, code))
-		goto out_unlock;
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
 
 	/*
 	 *	Construct source address and options.
@@ -750,6 +756,8 @@ ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b26ae8b5c1ce..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
+/* Called with BH disabled */
 static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmpv6_sk(net);
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path (f.e. SIT or
 		 * ip6ip6 tunnel) signals dst_link_failure() for an
 		 * outgoing ICMP6 packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 
 static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 /*
@@ -489,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		return;
 	}
 
+	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv6_global_allow(type))
+		goto out_bh_enable;
+
 	mip6_addr_swap(skb);
 
 	memset(&fl6, 0, sizeof(fl6));
@@ -507,10 +512,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
-
-	if (!icmpv6_global_allow(type))
-		goto out;
+		goto out_bh_enable;
 
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
@@ -571,6 +573,8 @@ out_dst_release:
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 /* Slightly more convenient version of icmp6_send.
@@ -684,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
+	local_bh_disable();
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
@@ -728,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
-- 
cgit v1.2.3


From 4b9d07a44015a0e940448fa3885b894349e8b162 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:12 +0100
Subject: net: introduce keepalive function in struct proto

Direct call of tcp_set_keepalive() function from protocol-agnostic
sock_setsockopt() function in net/core/sock.c violates network
layering. And newly introduced protocol (SMC-R) will need its own
keepalive function. Therefore, add "keepalive" function pointer
to "struct proto", and call it from sock_setsockopt() via this pointer.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reviewed-by: Utz Bacher <utz.bacher@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h   | 1 +
 net/core/sock.c      | 7 ++-----
 net/ipv4/tcp_ipv4.c  | 1 +
 net/ipv4/tcp_timer.c | 1 +
 net/ipv6/tcp_ipv6.c  | 1 +
 5 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index f0e867f58722..99deda67eba0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1024,6 +1024,7 @@ struct proto {
 	int			(*getsockopt)(struct sock *sk, int level,
 					int optname, char __user *optval,
 					int __user *option);
+	void			(*keepalive)(struct sock *sk, int valbool);
 #ifdef CONFIG_COMPAT
 	int			(*compat_setsockopt)(struct sock *sk,
 					int level,
diff --git a/net/core/sock.c b/net/core/sock.c
index f560e0826009..5018703ee2c2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -762,11 +762,8 @@ set_rcvbuf:
 		goto set_rcvbuf;
 
 	case SO_KEEPALIVE:
-#ifdef CONFIG_INET
-		if (sk->sk_protocol == IPPROTO_TCP &&
-		    sk->sk_type == SOCK_STREAM)
-			tcp_set_keepalive(sk, valbool);
-#endif
+		if (sk->sk_prot->keepalive)
+			sk->sk_prot->keepalive(sk, valbool);
 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 		break;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7e4be4f361f3..56d756ecfb59 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2376,6 +2376,7 @@ struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3705075f42c3..29a9bd5f1225 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -617,6 +617,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
 	else if (!val)
 		inet_csk_delete_keepalive_timer(sk);
 }
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
 
 
 static void tcp_keepalive_timer (unsigned long data)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a4cdf6a34c30..228965dca3c5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1889,6 +1889,7 @@ struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
-- 
cgit v1.2.3


From ac7138746e14137a451f8539614cdd349153e0c0 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:13 +0100
Subject: smc: establish new socket family

* enable smc module loading and unloading
 * register new socket family
 * basic smc socket creation and deletion
 * use backing TCP socket to run CLC (Connection Layer Control)
   handshake of SMC protocol
 * Setup for infiniband traffic is implemented in follow-on patches.
   For now fallback to TCP socket is always used.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reviewed-by: Utz Bacher <utz.bacher@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS            |   7 +
 include/linux/socket.h |   7 +-
 net/Kconfig            |   1 +
 net/Makefile           |   1 +
 net/core/sock.c        |   6 +-
 net/smc/Kconfig        |  11 +
 net/smc/Makefile       |   2 +
 net/smc/af_smc.c       | 620 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc.h          |  37 +++
 9 files changed, 688 insertions(+), 4 deletions(-)
 create mode 100644 net/smc/Kconfig
 create mode 100644 net/smc/Makefile
 create mode 100644 net/smc/af_smc.c
 create mode 100644 net/smc/smc.h

(limited to 'net')

diff --git a/MAINTAINERS b/MAINTAINERS
index d42a37a351f8..c8df0e1ef833 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10850,6 +10850,13 @@ S:	Maintained
 F:	drivers/staging/media/st-cec/
 F:	Documentation/devicetree/bindings/media/stih-cec.txt
 
+SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS
+M:	Ursula Braun <ubraun@linux.vnet.ibm.com>
+L:	linux-s390@vger.kernel.org
+W:	http://www.ibm.com/developerworks/linux/linux390/
+S:	Supported
+F:	net/smc/
+
 SYNOPSYS DESIGNWARE DMAC DRIVER
 M:	Viresh Kumar <vireshk@kernel.org>
 M:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index c06438023d79..082027457825 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -202,8 +202,12 @@ struct ucred {
 #define AF_VSOCK	40	/* vSockets			*/
 #define AF_KCM		41	/* Kernel Connection Multiplexor*/
 #define AF_QIPCRTR	42	/* Qualcomm IPC Router          */
+#define AF_SMC		43	/* smc sockets: reserve number for
+				 * PF_SMC protocol family that
+				 * reuses AF_INET address family
+				 */
 
-#define AF_MAX		43	/* For now.. */
+#define AF_MAX		44	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -251,6 +255,7 @@ struct ucred {
 #define PF_VSOCK	AF_VSOCK
 #define PF_KCM		AF_KCM
 #define PF_QIPCRTR	AF_QIPCRTR
+#define PF_SMC		AF_SMC
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/net/Kconfig b/net/Kconfig
index a1005007224c..2e9ee61822e2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..5d6e0e5ff7f8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211)		+= mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_SMC)		+= smc/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
 obj-$(CONFIG_CAIF)		+= caif/
diff --git a/net/core/sock.c b/net/core/sock.c
index 5018703ee2c2..31f72f3a3559 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
-  "sk_lock-AF_MAX"
+  "sk_lock-AF_SMC"   , "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
-  "slock-AF_MAX"
+  "slock-AF_SMC"   , "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
-  "clock-AF_MAX"
+  "closck-AF_smc"  , "clock-AF_MAX"
 };
 
 /*
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..bc029803e728
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,11 @@
+config SMC
+	tristate "SMC socket protocol family"
+	depends on INET && INFINIBAND
+	---help---
+	  SMC-R provides a "sockets over RDMA" solution making use of
+	  RDMA over Converged Ethernet (RoCE) technology to upgrade
+	  AF_INET TCP connections transparently.
+	  The Linux implementation of the SMC-R solution is designed as
+	  a separate socket family SMC.
+
+	  Select this option if you want to run SMC socket applications
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..c285c868dd82
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SMC)	+= smc.o
+smc-y := af_smc.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..7fd773fc6238
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,620 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ *  applies to SOCK_STREAM sockets only
+ *  offers an alternative communication option for TCP-protocol sockets
+ *  applicable with RoCE-cards only
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ *              based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include "smc.h"
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct proto smc_proto = {
+	.name		= "SMC",
+	.owner		= THIS_MODULE,
+	.keepalive	= smc_set_keepalive,
+	.obj_size	= sizeof(struct smc_sock),
+	.slab_flags	= SLAB_DESTROY_BY_RCU,
+};
+
+static int smc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+
+	if (!sk)
+		goto out;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	sk->sk_state = SMC_CLOSED;
+	if (smc->clcsock) {
+		sock_release(smc->clcsock);
+		smc->clcsock = NULL;
+	}
+
+	/* detach socket */
+	sock_orphan(sk);
+	sock->sk = NULL;
+	release_sock(sk);
+
+	sock_put(sk);
+out:
+	return 0;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+	if (sk->sk_state != SMC_CLOSED)
+		return;
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+	sk->sk_state = SMC_INIT;
+	sk->sk_destruct = smc_destruct;
+	sk->sk_protocol = SMCPROTO_SMC;
+	sk_refcnt_debug_inc(sk);
+
+	smc = smc_sk(sk);
+
+	return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+		    int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+
+	/* replicate tests from inet_bind(), to be safe wrt. future changes */
+	rc = -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_in))
+		goto out;
+
+	rc = -EAFNOSUPPORT;
+	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+	if ((addr->sin_family != AF_INET) &&
+	    ((addr->sin_family != AF_UNSPEC) ||
+	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+		goto out;
+
+	lock_sock(sk);
+
+	/* Check if socket is already active */
+	rc = -EINVAL;
+	if (sk->sk_state != SMC_INIT)
+		goto out_rel;
+
+	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+	release_sock(sk);
+out:
+	return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+				   unsigned long mask)
+{
+	/* options we don't get control via setsockopt for */
+	nsk->sk_type = osk->sk_type;
+	nsk->sk_sndbuf = osk->sk_sndbuf;
+	nsk->sk_rcvbuf = osk->sk_rcvbuf;
+	nsk->sk_sndtimeo = osk->sk_sndtimeo;
+	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+	nsk->sk_mark = osk->sk_mark;
+	nsk->sk_priority = osk->sk_priority;
+	nsk->sk_rcvlowat = osk->sk_rcvlowat;
+	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+	nsk->sk_err = osk->sk_err;
+
+	nsk->sk_flags &= ~mask;
+	nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_BROADCAST) | \
+			     (1UL << SOCK_TIMESTAMP) | \
+			     (1UL << SOCK_DBG) | \
+			     (1UL << SOCK_RCVTSTAMP) | \
+			     (1UL << SOCK_RCVTSTAMPNS) | \
+			     (1UL << SOCK_LOCALROUTE) | \
+			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+			     (1UL << SOCK_RXQ_OVFL) | \
+			     (1UL << SOCK_WIFI_STATUS) | \
+			     (1UL << SOCK_NOFCS) | \
+			     (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+		       int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+
+	smc = smc_sk(sk);
+
+	/* separate smc parameter checking to be safe */
+	if (alen < sizeof(addr->sa_family))
+		goto out_err;
+	if (addr->sa_family != AF_INET)
+		goto out_err;
+
+	lock_sock(sk);
+	switch (sk->sk_state) {
+	default:
+		goto out;
+	case SMC_ACTIVE:
+		rc = -EISCONN;
+		goto out;
+	case SMC_INIT:
+		rc = 0;
+		break;
+	}
+
+	smc_copy_sock_settings_to_clc(smc);
+	rc = kernel_connect(smc->clcsock, addr, alen, flags);
+	if (rc)
+		goto out;
+
+	sk->sk_state = SMC_ACTIVE;
+
+	/* always use TCP fallback as transport mechanism for now;
+	 * This will change once RDMA transport is implemented
+	 */
+	smc->use_fallback = true;
+
+out:
+	release_sock(sk);
+out_err:
+	return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+	struct sock *sk = &lsmc->sk;
+	struct socket *new_clcsock;
+	struct sock *new_sk;
+	int rc;
+
+	new_sk = smc_sock_alloc(sock_net(sk), NULL);
+	if (!new_sk) {
+		rc = -ENOMEM;
+		lsmc->sk.sk_err = ENOMEM;
+		*new_smc = NULL;
+		goto out;
+	}
+	*new_smc = smc_sk(new_sk);
+
+	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+	if (rc) {
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+
+	(*new_smc)->clcsock = new_clcsock;
+out:
+	return rc;
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	rc = -EINVAL;
+	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+		goto out;
+
+	rc = 0;
+	if (sk->sk_state == SMC_LISTEN) {
+		sk->sk_max_ack_backlog = backlog;
+		goto out;
+	}
+	/* some socket options are handled in core, so we could not apply
+	 * them to the clc socket -- copy smc socket options to clc socket
+	 */
+	smc_copy_sock_settings_to_clc(smc);
+
+	rc = kernel_listen(smc->clcsock, backlog);
+	if (rc)
+		goto out;
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = SMC_LISTEN;
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+		      int flags)
+{
+	struct smc_sock *new_smc;
+	struct sock *sk = sock->sk;
+	struct smc_sock *lsmc;
+	int rc;
+
+	lsmc = smc_sk(sk);
+	lock_sock(sk);
+
+	if (lsmc->sk.sk_state != SMC_LISTEN) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = smc_clcsock_accept(lsmc, &new_smc);
+	if (rc)
+		goto out;
+	sock_graft(&new_smc->sk, new_sock);
+	new_smc->sk.sk_state = SMC_ACTIVE;
+
+	smc_copy_sock_settings_to_smc(new_smc);
+
+	/* always use TCP fallback as transport mechanism for now;
+	 * This will change once RDMA transport is implemented
+	 */
+	new_smc->use_fallback = true;
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+		       int *len, int peer)
+{
+	struct smc_sock *smc;
+
+	if (peer && (sock->sk->sk_state != SMC_ACTIVE))
+		return -ENOTCONN;
+
+	smc = smc_sk(sock->sk);
+
+	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if (sk->sk_state != SMC_ACTIVE)
+		goto out;
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+	else
+		rc = sock_no_sendmsg(sock, msg, len);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+		       int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+		goto out;
+
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+	else
+		rc = sock_no_recvmsg(sock, msg, len, flags);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask = 0;
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
+	    smc->use_fallback) {
+		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+		/* if non-blocking connect finished ... */
+		lock_sock(sk);
+		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+			sk->sk_state = SMC_ACTIVE;
+			/* always use TCP fallback as transport mechanism;
+			 * This will change once RDMA transport is implemented
+			 */
+			smc->use_fallback = true;
+		}
+		release_sock(sk);
+	} else {
+		mask = sock_no_poll(file, sock, wait);
+	}
+
+	return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+
+	smc = smc_sk(sk);
+
+	if ((how < SHUT_RD) || (how > SHUT_RDWR))
+		goto out_err;
+
+	lock_sock(sk);
+
+	rc = -ENOTCONN;
+	if (sk->sk_state == SMC_CLOSED)
+		goto out;
+	if (smc->use_fallback) {
+		rc = kernel_sock_shutdown(smc->clcsock, how);
+		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+		if (sk->sk_shutdown == SHUTDOWN_MASK)
+			sk->sk_state = SMC_CLOSED;
+	} else {
+		rc = sock_no_shutdown(sock, how);
+	}
+
+out:
+	release_sock(sk);
+
+out_err:
+	return rc;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+
+	smc = smc_sk(sk);
+
+	/* generic setsockopts reaching us here always apply to the
+	 * CLC socket
+	 */
+	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	/* socket options apply to the CLC socket */
+	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+		     unsigned long arg)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	if (smc->use_fallback)
+		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+	else
+		return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if (sk->sk_state != SMC_ACTIVE)
+		goto out;
+	if (smc->use_fallback)
+		rc = kernel_sendpage(smc->clcsock, page, offset,
+				     size, flags);
+	else
+		rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+			       struct pipe_inode_info *pipe, size_t len,
+				    unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+		goto out;
+	if (smc->use_fallback) {
+		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+						    pipe, len, flags);
+	} else {
+		rc = -EOPNOTSUPP;
+	}
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+	.family		= PF_SMC,
+	.owner		= THIS_MODULE,
+	.release	= smc_release,
+	.bind		= smc_bind,
+	.connect	= smc_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= smc_accept,
+	.getname	= smc_getname,
+	.poll		= smc_poll,
+	.ioctl		= smc_ioctl,
+	.listen		= smc_listen,
+	.shutdown	= smc_shutdown,
+	.setsockopt	= smc_setsockopt,
+	.getsockopt	= smc_getsockopt,
+	.sendmsg	= smc_sendmsg,
+	.recvmsg	= smc_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= smc_sendpage,
+	.splice_read	= smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+	int rc;
+
+	rc = -ESOCKTNOSUPPORT;
+	if (sock->type != SOCK_STREAM)
+		goto out;
+
+	rc = -EPROTONOSUPPORT;
+	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+		goto out;
+
+	rc = -ENOBUFS;
+	sock->ops = &smc_sock_ops;
+	sk = smc_sock_alloc(net, sock);
+	if (!sk)
+		goto out;
+
+	/* create internal TCP socket for CLC handshake and fallback */
+	smc = smc_sk(sk);
+	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+			      IPPROTO_TCP, &smc->clcsock);
+	if (rc)
+		sk_common_release(sk);
+
+out:
+	return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+	.family	= PF_SMC,
+	.owner	= THIS_MODULE,
+	.create	= smc_create,
+};
+
+static int __init smc_init(void)
+{
+	int rc;
+
+	rc = proto_register(&smc_proto, 1);
+	if (rc) {
+		pr_err("%s: proto_register fails with %d\n", __func__, rc);
+		goto out;
+	}
+
+	rc = sock_register(&smc_sock_family_ops);
+	if (rc) {
+		pr_err("%s: sock_register fails with %d\n", __func__, rc);
+		goto out_proto;
+	}
+
+	return 0;
+
+out_proto:
+	proto_unregister(&smc_proto);
+out:
+	return rc;
+}
+
+static void __exit smc_exit(void)
+{
+	sock_unregister(PF_SMC);
+	proto_unregister(&smc_proto);
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..fcea7399e2eb
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,37 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for the SMC module (socket related)
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/sock.h>
+
+#define SMCPROTO_SMC		0	/* SMC protocol */
+
+enum smc_state {		/* possible states of an SMC socket */
+	SMC_ACTIVE	= 1,
+	SMC_INIT	= 2,
+	SMC_CLOSED	= 7,
+	SMC_LISTEN	= 10,
+};
+
+struct smc_sock {				/* smc sock container */
+	struct sock		sk;
+	struct socket		*clcsock;	/* internal tcp socket */
+	bool			use_fallback;	/* fallback to tcp */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+	return (struct smc_sock *)sk;
+}
+
+#endif	/* __SMC_H */
-- 
cgit v1.2.3


From a4cf0443c4143b19e42389a1674b5b65224544ce Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:14 +0100
Subject: smc: introduce SMC as an IB-client

* create a list of SMC IB-devices

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile |   2 +-
 net/smc/af_smc.c |  10 ++++
 net/smc/smc.h    |   6 +++
 net/smc/smc_ib.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_ib.h |  40 ++++++++++++++++
 5 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 net/smc/smc_ib.c
 create mode 100644 net/smc/smc_ib.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index c285c868dd82..56f2170e6f64 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o
+smc-y := af_smc.o smc_ib.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 7fd773fc6238..50492ee495ce 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -20,6 +20,7 @@
 #include <net/sock.h>
 
 #include "smc.h"
+#include "smc_ib.h"
 
 static void smc_set_keepalive(struct sock *sk, int val)
 {
@@ -597,8 +598,16 @@ static int __init smc_init(void)
 		goto out_proto;
 	}
 
+	rc = smc_ib_register_client();
+	if (rc) {
+		pr_err("%s: ib_register fails with %d\n", __func__, rc);
+		goto out_sock;
+	}
+
 	return 0;
 
+out_sock:
+	sock_unregister(PF_SMC);
 out_proto:
 	proto_unregister(&smc_proto);
 out:
@@ -607,6 +616,7 @@ out:
 
 static void __exit smc_exit(void)
 {
+	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
 	proto_unregister(&smc_proto);
 }
diff --git a/net/smc/smc.h b/net/smc/smc.h
index fcea7399e2eb..e87d79867099 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -16,6 +16,8 @@
 
 #define SMCPROTO_SMC		0	/* SMC protocol */
 
+#define SMC_MAX_PORTS		2	/* Max # of ports */
+
 enum smc_state {		/* possible states of an SMC socket */
 	SMC_ACTIVE	= 1,
 	SMC_INIT	= 2,
@@ -34,4 +36,8 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
 	return (struct smc_sock *)sk;
 }
 
+#define SMC_SYSTEMID_LEN		8
+
+extern u8	local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..7222b7ede900
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,143 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  IB infrastructure:
+ *  Establish SMC-R as an Infiniband Client to be notified about added and
+ *  removed IB devices of type RDMA.
+ *  Determine device and port characteristics for these IB devices.
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc_ib.h"
+#include "smc.h"
+
+struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
+	.lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+	.list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+#define SMC_LOCAL_SYSTEMID_RESET	"%%%%%%%"
+
+u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
+								 * identifier
+								 */
+
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct net_device *ndev;
+	int rc;
+
+	rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+			  &smcibdev->gid[ibport - 1], NULL);
+	/* the SMC protocol requires specification of the roce MAC address;
+	 * if net_device cannot be determined, it can be derived from gid 0
+	 */
+	ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
+	if (ndev) {
+		memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
+	} else if (!rc) {
+		memcpy(&smcibdev->mac[ibport - 1][0],
+		       &smcibdev->gid[ibport - 1].raw[8], 3);
+		memcpy(&smcibdev->mac[ibport - 1][3],
+		       &smcibdev->gid[ibport - 1].raw[13], 3);
+		smcibdev->mac[ibport - 1][0] &= ~0x02;
+	}
+	return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+						u8 ibport)
+{
+	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+	       sizeof(smcibdev->mac[ibport - 1]));
+	get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	int rc;
+
+	memset(&smcibdev->pattr[ibport - 1], 0,
+	       sizeof(smcibdev->pattr[ibport - 1]));
+	rc = ib_query_port(smcibdev->ibdev, ibport,
+			   &smcibdev->pattr[ibport - 1]);
+	if (rc)
+		goto out;
+	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+	if (rc)
+		goto out;
+	if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+		     sizeof(local_systemid)) &&
+	    smc_ib_port_active(smcibdev, ibport))
+		/* create unique system identifier */
+		smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+	return rc;
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static void smc_ib_add_dev(struct ib_device *ibdev)
+{
+	struct smc_ib_device *smcibdev;
+
+	if (ibdev->node_type != RDMA_NODE_IB_CA)
+		return;
+
+	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+	if (!smcibdev)
+		return;
+
+	smcibdev->ibdev = ibdev;
+
+	spin_lock(&smc_ib_devices.lock);
+	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+	spin_unlock(&smc_ib_devices.lock);
+	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+}
+
+/* callback function for ib_register_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+	struct smc_ib_device *smcibdev;
+
+	smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+	ib_set_client_data(ibdev, &smc_ib_client, NULL);
+	spin_lock(&smc_ib_devices.lock);
+	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+	spin_unlock(&smc_ib_devices.lock);
+	kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+	.name	= "smc_ib",
+	.add	= smc_ib_add_dev,
+	.remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+	return ib_register_client(&smc_ib_client);
+}
+
+void smc_ib_unregister_client(void)
+{
+	ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..63613e715f4f
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,40 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for IB environment
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <Ursula Braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <rdma/ib_verbs.h>
+
+#define SMC_MAX_PORTS			2	/* Max # of ports */
+#define SMC_GID_SIZE			sizeof(union ib_gid)
+
+struct smc_ib_devices {			/* list of smc ib devices definition */
+	struct list_head	list;
+	spinlock_t		lock;	/* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices	smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device {				/* ib-device infos for smc */
+	struct list_head	list;
+	struct ib_device	*ibdev;
+	struct ib_port_attr	pattr[SMC_MAX_PORTS];	/* ib dev. port attrs */
+	char			mac[SMC_MAX_PORTS][6]; /* mac address per port*/
+	union ib_gid		gid[SMC_MAX_PORTS]; /* gid per port */
+	u8			initialized : 1; /* ib dev CQ, evthdl done */
+};
+
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+
+#endif
-- 
cgit v1.2.3


From 6812baabf24d5c299c13223366a23c269408f4d0 Mon Sep 17 00:00:00 2001
From: Thomas Richter
Date: Mon, 9 Jan 2017 16:55:15 +0100
Subject: smc: establish pnet table management

Connection creation with SMC-R starts through an internal
TCP-connection. The Ethernet interface for this TCP-connection is not
restricted to the Ethernet interface of a RoCE device. Any existing
Ethernet interface belonging to the same physical net can be used, as
long as there is a defined relation between the Ethernet interface and
some RoCE devices. This relation is defined with the help of an
identification string called "Physical Net ID" or short "pnet ID".
Information about defined pnet IDs and their related Ethernet
interfaces and RoCE devices is stored in the SMC-R pnet table.

A pnet table entry consists of the identifying pnet ID and the
associated network and IB device.
This patch adds pnet table configuration support using the
generic netlink message interface referring to network and IB device
by their names. Commands exist to add, delete, and display pnet table
entries, and to flush or display the entire pnet table.

There are cross-checks to verify whether the ethernet interfaces
or infiniband devices really exist in the system. If either device
is not available, the pnet ID entry is not created.
Loss of network devices and IB devices is also monitored;
a pnet ID entry is removed when an associated network or
IB device is removed.

Signed-off-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/smc.h |  35 ++++
 net/smc/Makefile         |   2 +-
 net/smc/af_smc.c         |  11 +-
 net/smc/smc_ib.c         |   2 +
 net/smc/smc_pnet.c       | 534 +++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_pnet.h       |  23 ++
 6 files changed, 604 insertions(+), 3 deletions(-)
 create mode 100644 include/uapi/linux/smc.h
 create mode 100644 net/smc/smc_pnet.c
 create mode 100644 net/smc/smc_pnet.h

(limited to 'net')

diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
new file mode 100644
index 000000000000..ab1dea8e53ee
--- /dev/null
+++ b/include/uapi/linux/smc.h
@@ -0,0 +1,35 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for generic netlink based configuration of an SMC-R PNET table
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _UAPI_LINUX_SMC_H_
+#define _UAPI_LINUX_SMC_H_
+
+/* Netlink SMC_PNETID attributes */
+enum {
+	SMC_PNETID_UNSPEC,
+	SMC_PNETID_NAME,
+	SMC_PNETID_ETHNAME,
+	SMC_PNETID_IBNAME,
+	SMC_PNETID_IBPORT,
+	__SMC_PNETID_MAX,
+	SMC_PNETID_MAX = __SMC_PNETID_MAX - 1
+};
+
+enum {				/* SMC PNET Table commands */
+	SMC_PNETID_GET = 1,
+	SMC_PNETID_ADD,
+	SMC_PNETID_DEL,
+	SMC_PNETID_FLUSH
+};
+
+#define SMCR_GENL_FAMILY_NAME		"SMC_PNETID"
+#define SMCR_GENL_FAMILY_VERSION	1
+
+#endif /* _UAPI_LINUX_SMC_H */
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 56f2170e6f64..50f39ffec8c9 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o smc_ib.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 50492ee495ce..8b059b2fc34d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -21,6 +21,7 @@
 
 #include "smc.h"
 #include "smc_ib.h"
+#include "smc_pnet.h"
 
 static void smc_set_keepalive(struct sock *sk, int val)
 {
@@ -586,10 +587,14 @@ static int __init smc_init(void)
 {
 	int rc;
 
+	rc = smc_pnet_init();
+	if (rc)
+		return rc;
+
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
 		pr_err("%s: proto_register fails with %d\n", __func__, rc);
-		goto out;
+		goto out_pnet;
 	}
 
 	rc = sock_register(&smc_sock_family_ops);
@@ -610,7 +615,8 @@ out_sock:
 	sock_unregister(PF_SMC);
 out_proto:
 	proto_unregister(&smc_proto);
-out:
+out_pnet:
+	smc_pnet_exit();
 	return rc;
 }
 
@@ -619,6 +625,7 @@ static void __exit smc_exit(void)
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
 	proto_unregister(&smc_proto);
+	smc_pnet_exit();
 }
 
 module_init(smc_init);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 7222b7ede900..5b037f435bc1 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -14,6 +14,7 @@
 #include <linux/random.h>
 #include <rdma/ib_verbs.h>
 
+#include "smc_pnet.h"
 #include "smc_ib.h"
 #include "smc.h"
 
@@ -123,6 +124,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
 	spin_lock(&smc_ib_devices.lock);
 	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
 	spin_unlock(&smc_ib_devices.lock);
+	smc_pnet_remove_by_ibdev(smcibdev);
 	kfree(smcibdev);
 }
 
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic netlink support functions to configure an SMC-R PNET table
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+
+#define SMC_MAX_PNET_ID_LEN	16	/* Max. length of PNET id */
+
+static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+	[SMC_PNETID_NAME] = {
+		.type = NLA_NUL_STRING,
+		.len = SMC_MAX_PNET_ID_LEN - 1
+	},
+	[SMC_PNETID_ETHNAME] = {
+		.type = NLA_NUL_STRING,
+		.len = IFNAMSIZ - 1
+	},
+	[SMC_PNETID_IBNAME] = {
+		.type = NLA_NUL_STRING,
+		.len = IB_DEVICE_NAME_MAX - 1
+	},
+	[SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+static struct smc_pnettable {
+	rwlock_t lock;
+	struct list_head pnetlist;
+} smc_pnettable = {
+	.pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
+	.lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
+};
+
+/**
+ * struct smc_pnetentry - pnet identifier name entry
+ * @list: List node.
+ * @pnet_name: Pnet identifier name
+ * @ndev: pointer to network device.
+ * @smcibdev: Pointer to IB device.
+ */
+struct smc_pnetentry {
+	struct list_head list;
+	char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+	struct net_device *ndev;
+	struct smc_ib_device *smcibdev;
+	u8 ib_port;
+};
+
+/* Check if two RDMA device entries are identical. Use device name and port
+ * number for comparison.
+ */
+static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
+				 u8 ibport)
+{
+	return pnetelem->ib_port == ibport &&
+	       !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
+			sizeof(pnetelem->smcibdev->ibdev->name));
+}
+
+/* Find a pnetid in the pnet table.
+ */
+static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
+{
+	struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
+
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (!strncmp(pnetelem->pnet_name, pnet_name,
+			     sizeof(pnetelem->pnet_name))) {
+			found_pnetelem = pnetelem;
+			break;
+		}
+	}
+	read_unlock(&smc_pnettable.lock);
+	return found_pnetelem;
+}
+
+/* Remove a pnetid from the pnet table.
+ */
+static int smc_pnet_remove_by_pnetid(char *pnet_name)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (!strncmp(pnetelem->pnet_name, pnet_name,
+			     sizeof(pnetelem->pnet_name))) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Remove a pnet entry mentioning a given network device from the pnet table.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (pnetelem->ndev == ndev) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Remove a pnet entry mentioning a given ib device from the pnet table.
+ */
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (pnetelem->smcibdev == ibdev) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ */
+static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
+{
+	struct smc_pnetentry *pnetelem;
+	int rc = -EEXIST;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
+			     sizeof(new_pnetelem->pnet_name)) ||
+		    !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
+			     sizeof(new_pnetelem->ndev->name)) ||
+		    smc_pnet_same_ibname(pnetelem,
+					 new_pnetelem->smcibdev->ibdev->name,
+					 new_pnetelem->ib_port))
+			goto found;
+	}
+	list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
+	rc = 0;
+found:
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+	char *bf = skip_spaces(pnet_name);
+	size_t len = strlen(bf);
+	char *end = bf + len;
+
+	if (!len)
+		return false;
+	while (--end >= bf && isspace(*end))
+		;
+	if (end - bf >= SMC_MAX_PNET_ID_LEN)
+		return false;
+	while (bf <= end) {
+		if (!isalnum(*bf))
+			return false;
+		*pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+		bf++;
+	}
+	*pnetid = '\0';
+	return true;
+}
+
+/* Find an infiniband device by a given name. The device might not exist. */
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+	struct smc_ib_device *ibdev;
+
+	spin_lock(&smc_ib_devices.lock);
+	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+		if (!strncmp(ibdev->ibdev->name, ib_name,
+			     sizeof(ibdev->ibdev->name))) {
+			goto out;
+		}
+	}
+	ibdev = NULL;
+out:
+	spin_unlock(&smc_ib_devices.lock);
+	return ibdev;
+}
+
+/* Parse the supplied netlink attributes and fill a pnetentry structure.
+ * For ethernet and infiniband device names verify that the devices exist.
+ */
+static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
+			       struct nlattr *tb[])
+{
+	char *string, *ibname = NULL;
+	int rc = 0;
+
+	memset(pnetelem, 0, sizeof(*pnetelem));
+	INIT_LIST_HEAD(&pnetelem->list);
+	if (tb[SMC_PNETID_NAME]) {
+		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+	if (tb[SMC_PNETID_ETHNAME]) {
+		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+		pnetelem->ndev = dev_get_by_name(net, string);
+		if (!pnetelem->ndev)
+			return -ENOENT;
+	}
+	if (tb[SMC_PNETID_IBNAME]) {
+		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+		ibname = strim(ibname);
+		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+		if (!pnetelem->smcibdev) {
+			rc = -ENOENT;
+			goto error;
+		}
+	}
+	if (tb[SMC_PNETID_IBPORT]) {
+		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+		if (pnetelem->ib_port > SMC_MAX_PORTS) {
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+	return 0;
+
+error:
+	if (pnetelem->ndev)
+		dev_put(pnetelem->ndev);
+	return rc;
+}
+
+/* Convert an smc_pnetentry to a netlink attribute sequence */
+static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
+{
+	if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
+	    nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
+	    nla_put_string(msg, SMC_PNETID_IBNAME,
+			   pnetelem->smcibdev->ibdev->name) ||
+	    nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
+		return -1;
+	return 0;
+}
+
+/* Retrieve one PNETID entry */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct smc_pnetentry *pnetelem;
+	struct sk_buff *msg;
+	void *hdr;
+	int rc;
+
+	pnetelem = smc_pnet_find_pnetid(
+				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+	if (!pnetelem)
+		return -ENOENT;
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			  &smc_pnet_nl_family, 0, SMC_PNETID_GET);
+	if (!hdr) {
+		rc = -EMSGSIZE;
+		goto err_out;
+	}
+
+	if (smc_pnet_set_nla(msg, pnetelem)) {
+		rc = -ENOBUFS;
+		goto err_out;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+err_out:
+	nlmsg_free(msg);
+	return rc;
+}
+
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct smc_pnetentry *pnetelem;
+	int rc;
+
+	pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
+	if (!pnetelem)
+		return -ENOMEM;
+	rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
+	if (!rc)
+		rc = smc_pnet_enter(pnetelem);
+	if (rc) {
+		kfree(pnetelem);
+		return rc;
+	}
+	rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
+	if (rc)
+		smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
+	return rc;
+}
+
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+	return smc_pnet_remove_by_pnetid(
+				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+}
+
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+	cb->args[0] = 0;
+	return 0;
+}
+
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+			     u32 portid, u32 seq, u32 flags,
+			     struct smc_pnetentry *pnetelem)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+			  flags, SMC_PNETID_GET);
+	if (!hdr)
+		return -ENOMEM;
+	if (smc_pnet_set_nla(skb, pnetelem) < 0) {
+		genlmsg_cancel(skb, hdr);
+		return -EMSGSIZE;
+	}
+	genlmsg_end(skb, hdr);
+	return 0;
+}
+
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct smc_pnetentry *pnetelem;
+	int idx = 0;
+
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (idx++ < cb->args[0])
+			continue;
+		if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				      pnetelem)) {
+			--idx;
+			break;
+		}
+	}
+	cb->args[0] = idx;
+	read_unlock(&smc_pnettable.lock);
+	return skb->len;
+}
+
+/* Remove and delete all pnetids from pnet table.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		list_del(&pnetelem->list);
+		dev_put(pnetelem->ndev);
+		kfree(pnetelem);
+	}
+	write_unlock(&smc_pnettable.lock);
+	return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition */
+static const struct genl_ops smc_pnet_ops[] = {
+	{
+		.cmd = SMC_PNETID_GET,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_get,
+		.dumpit = smc_pnet_dump,
+		.start = smc_pnet_dump_start
+	},
+	{
+		.cmd = SMC_PNETID_ADD,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_add
+	},
+	{
+		.cmd = SMC_PNETID_DEL,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_del
+	},
+	{
+		.cmd = SMC_PNETID_FLUSH,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_flush
+	}
+};
+
+/* SMC_PNETID family definition */
+static struct genl_family smc_pnet_nl_family = {
+	.hdrsize = 0,
+	.name = SMCR_GENL_FAMILY_NAME,
+	.version = SMCR_GENL_FAMILY_VERSION,
+	.maxattr = SMC_PNETID_MAX,
+	.netnsok = true,
+	.module = THIS_MODULE,
+	.ops = smc_pnet_ops,
+	.n_ops =  ARRAY_SIZE(smc_pnet_ops)
+};
+
+static int smc_pnet_netdev_event(struct notifier_block *this,
+				 unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_REBOOT:
+	case NETDEV_UNREGISTER:
+		smc_pnet_remove_by_ndev(event_dev);
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block smc_netdev_notifier = {
+	.notifier_call = smc_pnet_netdev_event
+};
+
+int __init smc_pnet_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&smc_pnet_nl_family);
+	if (rc)
+		return rc;
+	rc = register_netdevice_notifier(&smc_netdev_notifier);
+	if (rc)
+		genl_unregister_family(&smc_pnet_nl_family);
+	return rc;
+}
+
+void smc_pnet_exit(void)
+{
+	smc_pnet_flush(NULL, NULL);
+	unregister_netdevice_notifier(&smc_netdev_notifier);
+	genl_unregister_family(&smc_pnet_nl_family);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+				 struct smc_ib_device **smcibdev, u8 *ibport)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+	struct smc_pnetentry *pnetelem;
+
+	*smcibdev = NULL;
+	*ibport = 0;
+
+	if (!dst)
+		return;
+	if (!dst->dev)
+		goto out_rel;
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (dst->dev == pnetelem->ndev) {
+			*smcibdev = pnetelem->smcibdev;
+			*ibport = pnetelem->ib_port;
+			break;
+		}
+	}
+	read_unlock(&smc_pnettable.lock);
+out_rel:
+	dst_release(dst);
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  PNET table queries
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+struct smc_ib_device;
+
+int smc_pnet_init(void) __init;
+void smc_pnet_exit(void);
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
+void smc_pnet_find_roce_resource(struct sock *sk,
+				 struct smc_ib_device **smcibdev, u8 *ibport);
+
+#endif
-- 
cgit v1.2.3


From a046d57da19f812216f393e7c535f5858f793ac3 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:16 +0100
Subject: smc: CLC handshake (incl. preparation steps)

* CLC (Connection Layer Control) handshake

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile  |   2 +-
 net/smc/af_smc.c  | 464 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 net/smc/smc.h     |  22 +++
 net/smc/smc_clc.c | 252 +++++++++++++++++++++++++++++
 net/smc/smc_clc.h | 114 ++++++++++++++
 5 files changed, 822 insertions(+), 32 deletions(-)
 create mode 100644 net/smc/smc_clc.c
 create mode 100644 net/smc/smc_clc.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index 50f39ffec8c9..c0ad5883e888 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o smc_pnet.o smc_ib.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 8b059b2fc34d..05c705a688e5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -6,6 +6,13 @@
  *  offers an alternative communication option for TCP-protocol sockets
  *  applicable with RoCE-cards only
  *
+ *  Initial restrictions:
+ *    - non-blocking connect postponed
+ *    - IPv6 support postponed
+ *    - support for alternate links postponed
+ *    - partial support for non-blocking sockets only
+ *    - support for urgent data postponed
+ *
  *  Copyright IBM Corp. 2016
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
@@ -17,12 +24,18 @@
 
 #include <linux/module.h>
 #include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc.h"
+#include "smc_clc.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
 
+static void smc_tcp_listen_work(struct work_struct *);
+
 static void smc_set_keepalive(struct sock *sk, int val)
 {
 	struct smc_sock *smc = smc_sk(sk);
@@ -88,9 +101,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
 	sk->sk_protocol = SMCPROTO_SMC;
-	sk_refcnt_debug_inc(sk);
-
 	smc = smc_sk(sk);
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_LIST_HEAD(&smc->accept_q);
+	spin_lock_init(&smc->accept_q_lock);
+	sk_refcnt_debug_inc(sk);
 
 	return sk;
 }
@@ -184,6 +199,119 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 }
 
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+			 __be32 *subnet, u8 *prefix_len)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	struct sockaddr_in addr;
+	int rc = -ENOENT;
+	int len;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	/* get address to which the internal TCP socket is bound */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	for_ifa(dst->dev->ip_ptr) {
+		if (ifa->ifa_address != addr.sin_addr.s_addr)
+			continue;
+		*prefix_len = inet_mask_len(ifa->ifa_mask);
+		*subnet = ifa->ifa_address & ifa->ifa_mask;
+		rc = 0;
+		break;
+	} endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+	struct smc_clc_msg_accept_confirm aclc;
+	struct smc_ib_device *smcibdev;
+	int reason_code = 0;
+	int rc = 0;
+	u8 ibport;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* do inband token exchange */
+	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0) /* configuration error */
+		goto decline_rdma;
+	/* receive SMC Accept CLC message */
+	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+				       SMC_CLC_ACCEPT);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create connection, link group, link
+	 */
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create rmbs, map rmbs, rtoken_handling, modify_qp
+	 */
+
+	rc = smc_clc_send_confirm(smc);
+	if (rc)
+		goto out_err;
+
+	/* tbd in follow-on patch: llc_confirm */
+
+out_connected:
+	smc_copy_sock_settings_to_clc(smc);
+	smc->sk.sk_state = SMC_ACTIVE;
+
+	return rc;
+
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err:
+	return rc;
+}
+
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		       int alen, int flags)
 {
@@ -198,6 +326,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		goto out_err;
 	if (addr->sa_family != AF_INET)
 		goto out_err;
+	smc->addr = addr;	/* needed for nonblocking connect */
 
 	lock_sock(sk);
 	switch (sk->sk_state) {
@@ -216,12 +345,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	if (rc)
 		goto out;
 
-	sk->sk_state = SMC_ACTIVE;
-
-	/* always use TCP fallback as transport mechanism for now;
-	 * This will change once RDMA transport is implemented
-	 */
-	smc->use_fallback = true;
+	/* setup RDMA connection */
+	rc = smc_connect_rdma(smc);
+	if (rc < 0)
+		goto out;
+	else
+		rc = 0; /* success cases including fallback */
 
 out:
 	release_sock(sk);
@@ -236,17 +365,32 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 	struct sock *new_sk;
 	int rc;
 
+	release_sock(&lsmc->sk);
 	new_sk = smc_sock_alloc(sock_net(sk), NULL);
 	if (!new_sk) {
 		rc = -ENOMEM;
 		lsmc->sk.sk_err = ENOMEM;
 		*new_smc = NULL;
+		lock_sock(&lsmc->sk);
 		goto out;
 	}
 	*new_smc = smc_sk(new_sk);
 
 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
-	if (rc) {
+	lock_sock(&lsmc->sk);
+	if  (rc < 0) {
+		lsmc->sk.sk_err = -rc;
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+	if (lsmc->sk.sk_state == SMC_CLOSED) {
+		if (new_clcsock)
+			sock_release(new_clcsock);
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -257,6 +401,216 @@ out:
 	return rc;
 }
 
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(parent);
+
+	sock_hold(sk);
+	spin_lock(&par->accept_q_lock);
+	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+	spin_lock(&par->accept_q_lock);
+	list_del_init(&smc_sk(sk)->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+	sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+static struct sock *smc_accept_dequeue(struct sock *parent,
+				       struct socket *new_sock)
+{
+	struct smc_sock *isk, *n;
+	struct sock *new_sk;
+
+	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+		new_sk = (struct sock *)isk;
+
+		smc_accept_unlink(new_sk);
+		if (new_sk->sk_state == SMC_CLOSED) {
+			/* tbd in follow-on patch: close this sock */
+			continue;
+		}
+		if (new_sock)
+			sock_graft(new_sk, new_sock);
+		return new_sk;
+	}
+	return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+static void smc_close_non_accepted(struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	sock_hold(sk);
+	if (smc->clcsock) {
+		struct socket *tcp;
+
+		tcp = smc->clcsock;
+		smc->clcsock = NULL;
+		sock_release(tcp);
+	}
+	/* more closing stuff to be added with socket closing patch */
+	sock_put(sk);
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+	struct smc_sock *new_smc = container_of(work, struct smc_sock,
+						smc_listen_work);
+	struct socket *newclcsock = new_smc->clcsock;
+	struct smc_sock *lsmc = new_smc->listen_smc;
+	struct smc_clc_msg_accept_confirm cclc;
+	struct sock *newsmcsk = &new_smc->sk;
+	struct smc_clc_msg_proposal pclc;
+	struct smc_ib_device *smcibdev;
+	struct sockaddr_in peeraddr;
+	int reason_code = 0;
+	int rc = 0, len;
+	__be32 subnet;
+	u8 prefix_len;
+	u8 ibport;
+
+	/* do inband token exchange -
+	 *wait for and receive SMC Proposal CLC message
+	 */
+	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+				       SMC_CLC_PROPOSAL);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(new_smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* determine subnet and mask from internal TCP socket */
+	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+	if ((pclc.outgoing_subnet != subnet) ||
+	    (pclc.prefix_len != prefix_len)) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* get address of the peer connected to the internal TCP socket */
+	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create connection, link_group, link
+	 */
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * create rmbs, map rmbs
+	 */
+
+	rc = smc_clc_send_accept(new_smc);
+	if (rc)
+		goto out_err;
+
+	/* receive SMC Confirm CLC message */
+	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+				       SMC_CLC_CONFIRM);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* tbd in follow-on patch: more steps to setup RDMA communcication,
+	 * rtoken_handling, modify_qp
+	 */
+
+out_connected:
+	sk_refcnt_debug_inc(newsmcsk);
+	newsmcsk->sk_state = SMC_ACTIVE;
+enqueue:
+	lock_sock(&lsmc->sk);
+	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		smc_accept_enqueue(&lsmc->sk, newsmcsk);
+	} else { /* no longer listening */
+		smc_close_non_accepted(newsmcsk);
+	}
+	release_sock(&lsmc->sk);
+
+	/* Wake up accept */
+	lsmc->sk.sk_data_ready(&lsmc->sk);
+	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+	return;
+
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	new_smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(new_smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err:
+	newsmcsk->sk_state = SMC_CLOSED;
+	goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+	struct smc_sock *lsmc = container_of(work, struct smc_sock,
+					     tcp_listen_work);
+	struct smc_sock *new_smc;
+	int rc = 0;
+
+	lock_sock(&lsmc->sk);
+	while (lsmc->sk.sk_state == SMC_LISTEN) {
+		rc = smc_clcsock_accept(lsmc, &new_smc);
+		if (rc)
+			goto out;
+		if (!new_smc)
+			continue;
+
+		new_smc->listen_smc = lsmc;
+		new_smc->use_fallback = false; /* assume rdma capability first*/
+		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+		smc_copy_sock_settings_to_smc(new_smc);
+		schedule_work(&new_smc->smc_listen_work);
+	}
+
+out:
+	release_sock(&lsmc->sk);
+	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
 static int smc_listen(struct socket *sock, int backlog)
 {
 	struct sock *sk = sock->sk;
@@ -286,6 +640,8 @@ static int smc_listen(struct socket *sock, int backlog)
 	sk->sk_max_ack_backlog = backlog;
 	sk->sk_ack_backlog = 0;
 	sk->sk_state = SMC_LISTEN;
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	schedule_work(&smc->tcp_listen_work);
 
 out:
 	release_sock(sk);
@@ -295,10 +651,11 @@ out:
 static int smc_accept(struct socket *sock, struct socket *new_sock,
 		      int flags)
 {
-	struct smc_sock *new_smc;
-	struct sock *sk = sock->sk;
+	struct sock *sk = sock->sk, *nsk;
+	DECLARE_WAITQUEUE(wait, current);
 	struct smc_sock *lsmc;
-	int rc;
+	long timeo;
+	int rc = 0;
 
 	lsmc = smc_sk(sk);
 	lock_sock(sk);
@@ -308,18 +665,30 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 		goto out;
 	}
 
-	rc = smc_clcsock_accept(lsmc, &new_smc);
-	if (rc)
-		goto out;
-	sock_graft(&new_smc->sk, new_sock);
-	new_smc->sk.sk_state = SMC_ACTIVE;
-
-	smc_copy_sock_settings_to_smc(new_smc);
+	/* Wait for an incoming connection */
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!timeo) {
+			rc = -EAGAIN;
+			break;
+		}
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		/* wakeup by sk_data_ready in smc_listen_work() */
+		sched_annotate_sleep();
+		lock_sock(sk);
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(timeo);
+			break;
+		}
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
-	/* always use TCP fallback as transport mechanism for now;
-	 * This will change once RDMA transport is implemented
-	 */
-	new_smc->use_fallback = true;
+	if (!rc)
+		rc = sock_error(nsk);
 
 out:
 	release_sock(sk);
@@ -379,29 +748,61 @@ out:
 	return rc;
 }
 
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+	struct smc_sock *isk;
+	struct sock *sk;
+
+	lock_sock(parent);
+	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+		sk = (struct sock *)isk;
+
+		if (sk->sk_state == SMC_ACTIVE) {
+			release_sock(parent);
+			return POLLIN | POLLRDNORM;
+		}
+	}
+	release_sock(parent);
+
+	return 0;
+}
+
 static unsigned int smc_poll(struct file *file, struct socket *sock,
 			     poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	unsigned int mask = 0;
 	struct smc_sock *smc;
+	int rc;
 
 	smc = smc_sk(sock->sk);
-	if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
-	    smc->use_fallback) {
+	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+		/* delegate to CLC child sock */
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		/* if non-blocking connect finished ... */
 		lock_sock(sk);
 		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
-			sk->sk_state = SMC_ACTIVE;
-			/* always use TCP fallback as transport mechanism;
-			 * This will change once RDMA transport is implemented
-			 */
-			smc->use_fallback = true;
+			sk->sk_err = smc->clcsock->sk->sk_err;
+			if (sk->sk_err) {
+				mask |= POLLERR;
+			} else {
+				rc = smc_connect_rdma(smc);
+				if (rc < 0)
+					mask |= POLLERR;
+				else
+					/* success cases including fallback */
+					mask |= POLLOUT | POLLWRNORM;
+			}
 		}
 		release_sock(sk);
 	} else {
-		mask = sock_no_poll(file, sock, wait);
+		sock_poll_wait(file, sk_sleep(sk), wait);
+		if (sk->sk_state == SMC_LISTEN)
+			/* woken up by sk_data_ready in smc_listen_work() */
+			mask |= smc_accept_poll(sk);
+		if (sk->sk_err)
+			mask |= POLLERR;
+		/* for now - to be enhanced in follow-on patch */
 	}
 
 	return mask;
@@ -568,6 +969,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 
 	/* create internal TCP socket for CLC handshake and fallback */
 	smc = smc_sk(sk);
+	smc->use_fallback = false; /* assume rdma capability first */
 	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
 			      IPPROTO_TCP, &smc->clcsock);
 	if (rc)
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e87d79867099..5946aaecdebf 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -28,6 +28,12 @@ enum smc_state {		/* possible states of an SMC socket */
 struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
+	struct sockaddr		*addr;		/* inet connect address */
+	struct smc_sock		*listen_smc;	/* listen parent */
+	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
+	struct work_struct	smc_listen_work;/* prepare new accept socket */
+	struct list_head	accept_q;	/* sockets to be accepted */
+	spinlock_t		accept_q_lock;	/* protects accept_q */
 	bool			use_fallback;	/* fallback to tcp */
 };
 
@@ -40,4 +46,20 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
 
 extern u8	local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
 
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+	return (smc->clcsock->sk->sk_policy[0] ||
+		smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+	return 0;
+}
+#endif
+
+int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
+			 u8 *prefix_len);
+
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..b613b866a2ef
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,252 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  CLC (connection layer control) handshake over initial TCP socket to
+ *  prepare for RDMA traffic
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+		     u8 expected_type)
+{
+	struct sock *clc_sk = smc->clcsock->sk;
+	struct smc_clc_msg_hdr *clcm = buf;
+	struct msghdr msg = {NULL, 0};
+	int reason_code = 0;
+	struct kvec vec;
+	int len, datlen;
+	int krflags;
+
+	/* peek the first few bytes to determine length of data to receive
+	 * so we don't consume any subsequent CLC message or payload data
+	 * in the TCP byte stream
+	 */
+	vec.iov_base = buf;
+	vec.iov_len = buflen;
+	krflags = MSG_PEEK | MSG_WAITALL;
+	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
+			     sizeof(struct smc_clc_msg_hdr), krflags);
+	if (signal_pending(current)) {
+		reason_code = -EINTR;
+		clc_sk->sk_err = EINTR;
+		smc->sk.sk_err = EINTR;
+		goto out;
+	}
+	if (clc_sk->sk_err) {
+		reason_code = -clc_sk->sk_err;
+		smc->sk.sk_err = clc_sk->sk_err;
+		goto out;
+	}
+	if (!len) { /* peer has performed orderly shutdown */
+		smc->sk.sk_err = ECONNRESET;
+		reason_code = -ECONNRESET;
+		goto out;
+	}
+	if (len < 0) {
+		smc->sk.sk_err = -len;
+		reason_code = len;
+		goto out;
+	}
+	datlen = ntohs(clcm->length);
+	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+	    (datlen < sizeof(struct smc_clc_msg_decline)) ||
+	    (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
+	    memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+	    ((clcm->type != SMC_CLC_DECLINE) &&
+	     (clcm->type != expected_type))) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+
+	/* receive the complete CLC message */
+	vec.iov_base = buf;
+	vec.iov_len = buflen;
+	memset(&msg, 0, sizeof(struct msghdr));
+	krflags = MSG_WAITALL;
+	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
+	if (len < datlen) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+	if (clcm->type == SMC_CLC_DECLINE)
+		reason_code = SMC_CLC_DECL_REPLY;
+out:
+	return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+			 u8 out_of_sync)
+{
+	struct smc_clc_msg_decline dclc;
+	struct msghdr msg;
+	struct kvec vec;
+	int len;
+
+	memset(&dclc, 0, sizeof(dclc));
+	memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	dclc.hdr.type = SMC_CLC_DECLINE;
+	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+	dclc.hdr.version = SMC_CLC_V1;
+	dclc.hdr.flag = out_of_sync ? 1 : 0;
+	memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+	dclc.peer_diagnosis = htonl(peer_diag_info);
+	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &dclc;
+	vec.iov_len = sizeof(struct smc_clc_msg_decline);
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+			     sizeof(struct smc_clc_msg_decline));
+	if (len < sizeof(struct smc_clc_msg_decline))
+		smc->sk.sk_err = EPROTO;
+	if (len < 0)
+		smc->sk.sk_err = -len;
+	return len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc,
+			  struct smc_ib_device *smcibdev,
+			  u8 ibport)
+{
+	struct smc_clc_msg_proposal pclc;
+	int reason_code = 0;
+	struct msghdr msg;
+	struct kvec vec;
+	int len, rc;
+
+	/* send SMC Proposal CLC message */
+	memset(&pclc, 0, sizeof(pclc));
+	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	pclc.hdr.type = SMC_CLC_PROPOSAL;
+	pclc.hdr.length = htons(sizeof(pclc));
+	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
+	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1],
+	       sizeof(smcibdev->mac[ibport - 1]));
+
+	/* determine subnet and mask from internal TCP socket */
+	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
+				  &pclc.prefix_len);
+	if (rc)
+		return SMC_CLC_DECL_CNFERR; /* configuration error */
+	memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &pclc;
+	vec.iov_len = sizeof(pclc);
+	/* due to the few bytes needed for clc-handshake this cannot block */
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+	if (len < sizeof(pclc)) {
+		if (len >= 0) {
+			reason_code = -ENETUNREACH;
+			smc->sk.sk_err = -reason_code;
+		} else {
+			smc->sk.sk_err = smc->clcsock->sk->sk_err;
+			reason_code = -smc->sk.sk_err;
+		}
+	}
+
+	return reason_code;
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+	struct smc_clc_msg_accept_confirm cclc;
+	int reason_code = 0;
+	struct msghdr msg;
+	struct kvec vec;
+	int len;
+
+	/* send SMC Confirm CLC msg */
+	memset(&cclc, 0, sizeof(cclc));
+	memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	cclc.hdr.type = SMC_CLC_CONFIRM;
+	cclc.hdr.length = htons(sizeof(cclc));
+	cclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+
+	/* tbd in follow-on patch: fill in link-related values */
+
+	/* tbd in follow-on patch: fill in rmb-related values */
+
+	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+
+	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &cclc;
+	vec.iov_len = sizeof(cclc);
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
+	if (len < sizeof(cclc)) {
+		if (len >= 0) {
+			reason_code = -ENETUNREACH;
+			smc->sk.sk_err = -reason_code;
+		} else {
+			smc->sk.sk_err = smc->clcsock->sk->sk_err;
+			reason_code = -smc->sk.sk_err;
+		}
+	}
+	return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc)
+{
+	struct smc_clc_msg_accept_confirm aclc;
+	struct msghdr msg;
+	struct kvec vec;
+	int rc = 0;
+	int len;
+
+	memset(&aclc, 0, sizeof(aclc));
+	memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	aclc.hdr.type = SMC_CLC_ACCEPT;
+	aclc.hdr.length = htons(sizeof(aclc));
+	aclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+
+	/* tbd in follow-on patch: fill in link-related values */
+
+	/* tbd in follow-on patch: fill in rmb-related values */
+
+	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
+	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &aclc;
+	vec.iov_len = sizeof(aclc);
+	len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
+	if (len < sizeof(aclc)) {
+		if (len >= 0)
+			new_smc->sk.sk_err = EPROTO;
+		else
+			new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
+		rc = sock_error(&new_smc->sk);
+	}
+
+	return rc;
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..60bfc8bd8d89
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,114 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  CLC (connection layer control) handshake over initial TCP socket to
+ *  prepare for RDMA traffic
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL	0x01
+#define SMC_CLC_ACCEPT		0x02
+#define SMC_CLC_CONFIRM		0x03
+#define SMC_CLC_DECLINE		0x04
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+#define SMC_CLC_V1		0x1		/* SMC version                */
+#define CLC_WAIT_TIME		(6 * HZ)	/* max. wait time on clcsock  */
+#define SMC_CLC_DECL_MEM	0x01010000  /* insufficient memory resources  */
+#define SMC_CLC_DECL_TIMEOUT	0x02000000  /* timeout                        */
+#define SMC_CLC_DECL_CNFERR	0x03000000  /* configuration error            */
+#define SMC_CLC_DECL_IPSEC	0x03030000  /* IPsec usage                    */
+#define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error          */
+#define SMC_CLC_DECL_REPLY	0x06000000  /* reply to a received decline    */
+#define SMC_CLC_DECL_INTERR	0x99990000  /* internal error                 */
+
+struct smc_clc_msg_hdr {	/* header1 of clc messages */
+	u8 eyecatcher[4];	/* eye catcher */
+	u8 type;		/* proposal / accept / confirm / decline */
+	__be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 version : 4,
+	   flag    : 1,
+	   rsvd	   : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 rsvd    : 3,
+	   flag    : 1,
+	   version : 4;
+#endif
+} __packed;			/* format defined in RFC7609 */
+
+struct smc_clc_msg_trail {	/* trailer of clc messages */
+	u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local {	/* header2 of clc messages */
+	u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+	u8 gid[16];		/* gid of ib_device port */
+	u8 mac[6];		/* mac of ib_device port */
+};
+
+struct smc_clc_msg_proposal {	/* clc proposal message */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	__be16 iparea_offset;	/* offset to IP address information area */
+	__be32 outgoing_subnet;	/* subnet mask */
+	u8 prefix_len;		/* number of significant bits in mask */
+	u8 reserved[2];
+	u8 ipv6_prefixes_cnt;	/* number of IPv6 prefixes in prefix array */
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	u8 qpn[3];		/* QP number */
+	__be32 rmb_rkey;	/* RMB rkey */
+	u8 conn_idx;		/* Connection index, which RMBE in RMB */
+	__be32 rmbe_alert_token;/* unique connection id */
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 rmbe_size : 4,	/* RMBE buf size (compressed notation) */
+	   qp_mtu   : 4;	/* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 qp_mtu   : 4,
+	   rmbe_size : 4;
+#endif
+	u8 reserved;
+	__be64 rmb_dma_addr;	/* RMB virtual address */
+	u8 reserved2;
+	u8 psn[3];		/* initial packet sequence number */
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed;			/* format defined in RFC7609 */
+
+struct smc_clc_msg_decline {	/* clc decline message */
+	struct smc_clc_msg_hdr hdr;
+	u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+	__be32 peer_diagnosis;	/* diagnosis information */
+	u8 reserved2[4];
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_sock;
+struct smc_ib_device;
+
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+		     u8 expected_type);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+			 u8 out_of_sync);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
+			  u8 ibport);
+int smc_clc_send_confirm(struct smc_sock *smc);
+int smc_clc_send_accept(struct smc_sock *smc);
+
+#endif
-- 
cgit v1.2.3


From 0cfdd8f92cac01afbb12e4500514036a2b78756b Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:17 +0100
Subject: smc: connection and link group creation

* create smc_connection for SMC-sockets
* determine suitable link group for a connection
* create a new link group if necessary

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile   |   2 +-
 net/smc/af_smc.c   | 102 ++++++++++++++--
 net/smc/smc.h      |  36 ++++++
 net/smc/smc_clc.c  |  38 +++++-
 net/smc/smc_clc.h  |   2 +-
 net/smc/smc_core.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_core.h | 106 +++++++++++++++++
 7 files changed, 605 insertions(+), 17 deletions(-)
 create mode 100644 net/smc/smc_core.c
 create mode 100644 net/smc/smc_core.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index c0ad5883e888..cb8bcd9df53e 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 05c705a688e5..5fda37decc55 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -31,9 +31,19 @@
 
 #include "smc.h"
 #include "smc_clc.h"
+#include "smc_core.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
 
+static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
+						 * creation
+						 */
+
+struct smc_lgr_list smc_lgr_list = {		/* established link groups */
+	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+	.list = LIST_HEAD_INIT(smc_lgr_list.list),
+};
+
 static void smc_tcp_listen_work(struct work_struct *);
 
 static void smc_set_keepalive(struct sock *sk, int val)
@@ -235,11 +245,31 @@ out:
 	return rc;
 }
 
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+				    struct smc_clc_msg_accept_confirm *clc)
+{
+	smc->conn.peer_conn_idx = clc->conn_idx;
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+				    struct smc_clc_msg_accept_confirm *clc)
+{
+	link->peer_qpn = ntoh24(clc->qpn);
+	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
+	link->peer_psn = ntoh24(clc->psn);
+	link->peer_mtu = clc->qp_mtu;
+}
+
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc)
 {
+	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
 	struct smc_clc_msg_accept_confirm aclc;
+	int local_contact = SMC_FIRST_CONTACT;
 	struct smc_ib_device *smcibdev;
+	struct smc_link *link;
+	u8 srv_first_contact;
 	int reason_code = 0;
 	int rc = 0;
 	u8 ibport;
@@ -278,26 +308,43 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	if (reason_code > 0)
 		goto decline_rdma;
 
-	/* tbd in follow-on patch: more steps to setup RDMA communcication,
-	 * create connection, link group, link
-	 */
+	srv_first_contact = aclc.hdr.flag;
+	mutex_lock(&smc_create_lgr_pending);
+	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
+					ibport, &aclc.lcl, srv_first_contact);
+	if (local_contact < 0) {
+		rc = local_contact;
+		if (rc == -ENOMEM)
+			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+		else if (rc == -ENOLINK)
+			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+		goto decline_rdma_unlock;
+	}
+	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
+	smc_conn_save_peer_info(smc, &aclc);
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_link_save_peer_info(link, &aclc);
 	/* tbd in follow-on patch: more steps to setup RDMA communcication,
 	 * create rmbs, map rmbs, rtoken_handling, modify_qp
 	 */
 
 	rc = smc_clc_send_confirm(smc);
 	if (rc)
-		goto out_err;
+		goto out_err_unlock;
 
 	/* tbd in follow-on patch: llc_confirm */
 
+	mutex_unlock(&smc_create_lgr_pending);
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
 	smc->sk.sk_state = SMC_ACTIVE;
 
-	return rc;
+	return rc ? rc : local_contact;
 
+decline_rdma_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_free(&smc->conn);
 decline_rdma:
 	/* RDMA setup failed, switch back to TCP */
 	smc->use_fallback = true;
@@ -308,6 +355,9 @@ decline_rdma:
 	}
 	goto out_connected;
 
+out_err_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_free(&smc->conn);
 out_err:
 	return rc;
 }
@@ -476,10 +526,12 @@ static void smc_listen_work(struct work_struct *work)
 	struct socket *newclcsock = new_smc->clcsock;
 	struct smc_sock *lsmc = new_smc->listen_smc;
 	struct smc_clc_msg_accept_confirm cclc;
+	int local_contact = SMC_REUSE_CONTACT;
 	struct sock *newsmcsk = &new_smc->sk;
 	struct smc_clc_msg_proposal pclc;
 	struct smc_ib_device *smcibdev;
 	struct sockaddr_in peeraddr;
+	struct smc_link *link;
 	int reason_code = 0;
 	int rc = 0, len;
 	__be32 subnet;
@@ -527,15 +579,30 @@ static void smc_listen_work(struct work_struct *work)
 	/* get address of the peer connected to the internal TCP socket */
 	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
 
-	/* tbd in follow-on patch: more steps to setup RDMA communcication,
-	 * create connection, link_group, link
-	 */
+	/* allocate connection / link group */
+	mutex_lock(&smc_create_lgr_pending);
+	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
+					smcibdev, ibport, &pclc.lcl, 0);
+	if (local_contact == SMC_REUSE_CONTACT)
+		/* lock no longer needed, free it due to following
+		 * smc_clc_wait_msg() call
+		 */
+		mutex_unlock(&smc_create_lgr_pending);
+	if (local_contact < 0) {
+		rc = local_contact;
+		if (rc == -ENOMEM)
+			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+		else if (rc == -ENOLINK)
+			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+		goto decline_rdma;
+	}
+	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
 	/* tbd in follow-on patch: more steps to setup RDMA communcication,
 	 * create rmbs, map rmbs
 	 */
 
-	rc = smc_clc_send_accept(new_smc);
+	rc = smc_clc_send_accept(new_smc, local_contact);
 	if (rc)
 		goto out_err;
 
@@ -546,6 +613,9 @@ static void smc_listen_work(struct work_struct *work)
 		goto out_err;
 	if (reason_code > 0)
 		goto decline_rdma;
+	smc_conn_save_peer_info(new_smc, &cclc);
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_link_save_peer_info(link, &cclc);
 
 	/* tbd in follow-on patch: more steps to setup RDMA communcication,
 	 * rtoken_handling, modify_qp
@@ -555,6 +625,8 @@ out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
 	newsmcsk->sk_state = SMC_ACTIVE;
 enqueue:
+	if (local_contact == SMC_FIRST_CONTACT)
+		mutex_unlock(&smc_create_lgr_pending);
 	lock_sock(&lsmc->sk);
 	if (lsmc->sk.sk_state == SMC_LISTEN) {
 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -570,6 +642,7 @@ enqueue:
 
 decline_rdma:
 	/* RDMA setup failed, switch back to TCP */
+	smc_conn_free(&new_smc->conn);
 	new_smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 		rc = smc_clc_send_decline(new_smc, reason_code, 0);
@@ -1024,6 +1097,17 @@ out_pnet:
 
 static void __exit smc_exit(void)
 {
+	struct smc_link_group *lgr, *lg;
+	LIST_HEAD(lgr_freeing_list);
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	if (!list_empty(&smc_lgr_list.list))
+		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+		list_del_init(&lgr->list);
+		smc_lgr_free(lgr); /* free link group */
+	}
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
 	proto_unregister(&smc_proto);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 5946aaecdebf..11265bde4655 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,6 +14,8 @@
 #include <linux/types.h>
 #include <net/sock.h>
 
+#include "smc_ib.h"
+
 #define SMCPROTO_SMC		0	/* SMC protocol */
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
@@ -25,9 +27,19 @@ enum smc_state {		/* possible states of an SMC socket */
 	SMC_LISTEN	= 10,
 };
 
+struct smc_link_group;
+
+struct smc_connection {
+	struct rb_node		alert_node;
+	struct smc_link_group	*lgr;		/* link group of connection */
+	u32			alert_token_local; /* unique conn. id */
+	u8			peer_conn_idx;	/* from tcp handshake */
+};
+
 struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
+	struct smc_connection	conn;		/* smc connection */
 	struct sockaddr		*addr;		/* inet connect address */
 	struct smc_sock		*listen_smc;	/* listen parent */
 	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
@@ -46,6 +58,24 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
 
 extern u8	local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
 
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+	__be32 t;
+
+	t = cpu_to_be32(host);
+	memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+	__be32 t = 0;
+
+	memcpy(((u8 *)&t) + 1, net, 3);
+	return be32_to_cpu(t);
+}
+
 #ifdef CONFIG_XFRM
 static inline bool using_ipsec(struct smc_sock *smc)
 {
@@ -59,7 +89,13 @@ static inline bool using_ipsec(struct smc_sock *smc)
 }
 #endif
 
+struct smc_clc_msg_local;
+
 int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
 			 u8 *prefix_len);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+		    struct smc_ib_device *smcibdev, u8 ibport,
+		    struct smc_clc_msg_local *lcl, int srv_first_contact);
 
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index b613b866a2ef..f8e47c06d2ed 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -14,6 +14,7 @@
 #include <net/tcp.h>
 
 #include "smc.h"
+#include "smc_core.h"
 #include "smc_clc.h"
 #include "smc_ib.h"
 
@@ -89,8 +90,13 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		reason_code = -EPROTO;
 		goto out;
 	}
-	if (clcm->type == SMC_CLC_DECLINE)
+	if (clcm->type == SMC_CLC_DECLINE) {
 		reason_code = SMC_CLC_DECL_REPLY;
+		if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
+			== SMC_CLC_DECL_SYNCERR)
+			smc->conn.lgr->sync_err = true;
+	}
+
 out:
 	return reason_code;
 }
@@ -175,12 +181,15 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 /* send CLC CONFIRM message across internal TCP socket */
 int smc_clc_send_confirm(struct smc_sock *smc)
 {
+	struct smc_connection *conn = &smc->conn;
 	struct smc_clc_msg_accept_confirm cclc;
+	struct smc_link *link;
 	int reason_code = 0;
 	struct msghdr msg;
 	struct kvec vec;
 	int len;
 
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	/* send SMC Confirm CLC msg */
 	memset(&cclc, 0, sizeof(cclc));
 	memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
@@ -188,12 +197,18 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	cclc.hdr.length = htons(sizeof(cclc));
 	cclc.hdr.version = SMC_CLC_V1;		/* SMC version */
 	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
-
-	/* tbd in follow-on patch: fill in link-related values */
+	memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+	       SMC_GID_SIZE);
+	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+	       sizeof(link->smcibdev->mac));
 
 	/* tbd in follow-on patch: fill in rmb-related values */
 
+	hton24(cclc.qpn, link->roce_qp->qp_num);
 	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+	hton24(cclc.psn, link->psn_initial);
 
 	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 
@@ -214,26 +229,37 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 }
 
 /* send CLC ACCEPT message across internal TCP socket */
-int smc_clc_send_accept(struct smc_sock *new_smc)
+int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 {
+	struct smc_connection *conn = &new_smc->conn;
 	struct smc_clc_msg_accept_confirm aclc;
+	struct smc_link *link;
 	struct msghdr msg;
 	struct kvec vec;
 	int rc = 0;
 	int len;
 
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	memset(&aclc, 0, sizeof(aclc));
 	memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	aclc.hdr.type = SMC_CLC_ACCEPT;
 	aclc.hdr.length = htons(sizeof(aclc));
 	aclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	if (srv_first_contact)
+		aclc.hdr.flag = 1;
 	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
-
-	/* tbd in follow-on patch: fill in link-related values */
+	memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+	       SMC_GID_SIZE);
+	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+	       sizeof(link->smcibdev->mac[link->ibport - 1]));
 
 	/* tbd in follow-on patch: fill in rmb-related values */
 
+	hton24(aclc.qpn, link->roce_qp->qp_num);
 	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
+	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+	aclc.qp_mtu = link->path_mtu;
+	hton24(aclc.psn, link->psn_initial);
 	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 
 	memset(&msg, 0, sizeof(msg));
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 60bfc8bd8d89..5924d998b5ca 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -109,6 +109,6 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
 int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
 			  u8 ibport);
 int smc_clc_send_confirm(struct smc_sock *smc);
-int smc_clc_send_accept(struct smc_sock *smc);
+int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
 
 #endif
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..b88a82918c82
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,336 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Basic Transport Functions exploiting Infiniband API
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+
+#define SMC_LGR_FREE_DELAY	(600 * HZ)
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @smc		connection to register
+ * Returns 0 on success, != otherwise.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+	struct rb_node **link, *parent = NULL;
+	u32 token = conn->alert_token_local;
+
+	link = &conn->lgr->conns_all.rb_node;
+	while (*link) {
+		struct smc_connection *cur = rb_entry(*link,
+					struct smc_connection, alert_node);
+
+		parent = *link;
+		if (cur->alert_token_local > token)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	/* Put the new node there */
+	rb_link_node(&conn->alert_node, parent, link);
+	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static void smc_lgr_register_conn(struct smc_connection *conn)
+{
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	static atomic_t nexttoken = ATOMIC_INIT(0);
+
+	/* find a new alert_token_local value not yet used by some connection
+	 * in this link group
+	 */
+	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+	while (!conn->alert_token_local) {
+		conn->alert_token_local = atomic_inc_return(&nexttoken);
+		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+			conn->alert_token_local = 0;
+	}
+	smc_lgr_add_alert_token(conn);
+	conn->lgr->conns_num++;
+}
+
+/* Unregister connection and reset the alert token of the given connection<
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	struct smc_link_group *lgr = conn->lgr;
+
+	rb_erase(&conn->alert_node, &lgr->conns_all);
+	lgr->conns_num--;
+	conn->alert_token_local = 0;
+	conn->lgr = NULL;
+	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection and trigger lgr freeing if applicable
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+	int reduced = 0;
+
+	write_lock_bh(&lgr->conns_lock);
+	if (conn->alert_token_local) {
+		reduced = 1;
+		__smc_lgr_unregister_conn(conn);
+	}
+	write_unlock_bh(&lgr->conns_lock);
+	if (reduced && !lgr->conns_num)
+		schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+}
+
+static void smc_lgr_free_work(struct work_struct *work)
+{
+	struct smc_link_group *lgr = container_of(to_delayed_work(work),
+						  struct smc_link_group,
+						  free_work);
+	bool conns;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	read_lock_bh(&lgr->conns_lock);
+	conns = RB_EMPTY_ROOT(&lgr->conns_all);
+	read_unlock_bh(&lgr->conns_lock);
+	if (!conns) { /* number of lgr connections is no longer zero */
+		spin_unlock_bh(&smc_lgr_list.lock);
+		return;
+	}
+	list_del_init(&lgr->list); /* remove from smc_lgr_list */
+	spin_unlock_bh(&smc_lgr_list.lock);
+	smc_lgr_free(lgr);
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+			  struct smc_ib_device *smcibdev, u8 ibport,
+			  char *peer_systemid, unsigned short vlan_id)
+{
+	struct smc_link_group *lgr;
+	struct smc_link *lnk;
+	u8 rndvec[3];
+	int rc = 0;
+
+	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+	if (!lgr) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+	lgr->sync_err = false;
+	lgr->daddr = peer_in_addr;
+	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+	lgr->vlan_id = vlan_id;
+	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+	lgr->conns_all = RB_ROOT;
+
+	lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	/* initialize link */
+	lnk->smcibdev = smcibdev;
+	lnk->ibport = ibport;
+	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+	get_random_bytes(rndvec, sizeof(rndvec));
+	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+
+	smc->conn.lgr = lgr;
+	rwlock_init(&lgr->conns_lock);
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_add(&lgr->list, &smc_lgr_list.list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+out:
+	return rc;
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	if (!lgr)
+		return;
+	smc_lgr_unregister_conn(conn);
+}
+
+static void smc_link_clear(struct smc_link *lnk)
+{
+	lnk->peer_qpn = 0;
+}
+
+/* remove a link group */
+void smc_lgr_free(struct smc_link_group *lgr)
+{
+	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+	kfree(lgr);
+}
+
+/* terminate linkgroup abnormally */
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+	struct smc_connection *conn;
+	struct rb_node *node;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	if (list_empty(&lgr->list)) {
+		/* termination already triggered */
+		spin_unlock_bh(&smc_lgr_list.lock);
+		return;
+	}
+	/* do not use this link group for new connections */
+	list_del_init(&lgr->list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+
+	write_lock_bh(&lgr->conns_lock);
+	node = rb_first(&lgr->conns_all);
+	while (node) {
+		conn = rb_entry(node, struct smc_connection, alert_node);
+		__smc_lgr_unregister_conn(conn);
+		node = rb_first(&lgr->conns_all);
+	}
+	write_unlock_bh(&lgr->conns_lock);
+	schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+}
+
+/* Determine vlan of internal TCP socket.
+ * @vlan_id: address to store the determined vlan id into
+ */
+static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	int rc = 0;
+
+	*vlan_id = 0;
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	if (is_vlan_dev(dst->dev))
+		*vlan_id = vlan_dev_vlan_id(dst->dev);
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* determine the link gid matching the vlan id of the link group */
+static int smc_link_determine_gid(struct smc_link_group *lgr)
+{
+	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	struct ib_gid_attr gattr;
+	union ib_gid gid;
+	int i;
+
+	if (!lgr->vlan_id) {
+		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
+		return 0;
+	}
+
+	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
+	     i++) {
+		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+				 &gattr))
+			continue;
+		if (gattr.ndev &&
+		    (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
+			lnk->gid = gid;
+			return 0;
+		}
+	}
+	return -ENODEV;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+		    struct smc_ib_device *smcibdev, u8 ibport,
+		    struct smc_clc_msg_local *lcl, int srv_first_contact)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr;
+	unsigned short vlan_id;
+	enum smc_lgr_role role;
+	int local_contact = SMC_FIRST_CONTACT;
+	int rc = 0;
+
+	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
+	if (rc)
+		return rc;
+
+	if ((role == SMC_CLNT) && srv_first_contact)
+		/* create new link group as well */
+		goto create;
+
+	/* determine if an existing link group can be reused */
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+		write_lock_bh(&lgr->conns_lock);
+		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
+			    SMC_SYSTEMID_LEN) &&
+		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+			    SMC_GID_SIZE) &&
+		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+			    sizeof(lcl->mac)) &&
+		    !lgr->sync_err &&
+		    (lgr->role == role) &&
+		    (lgr->vlan_id == vlan_id)) {
+			/* link group found */
+			local_contact = SMC_REUSE_CONTACT;
+			conn->lgr = lgr;
+			smc_lgr_register_conn(conn); /* add smc conn to lgr */
+			write_unlock_bh(&lgr->conns_lock);
+			break;
+		}
+		write_unlock_bh(&lgr->conns_lock);
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+
+	if (role == SMC_CLNT && !srv_first_contact &&
+	    (local_contact == SMC_FIRST_CONTACT)) {
+		/* Server reuses a link group, but Client wants to start
+		 * a new one
+		 * send out_of_sync decline, reason synchr. error
+		 */
+		return -ENOLINK;
+	}
+
+create:
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+				    lcl->id_for_peer, vlan_id);
+		if (rc)
+			goto out;
+		smc_lgr_register_conn(conn); /* add smc conn to lgr */
+		rc = smc_link_determine_gid(conn->lgr);
+	}
+
+out:
+	return rc ? rc : local_contact;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..14b787abae02
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,106 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for SMC Connections, Link Groups and Links
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+struct smc_lgr_list {			/* list of link group definition */
+	struct list_head	list;
+	spinlock_t		lock;	/* protects list of link groups */
+};
+
+extern struct smc_lgr_list	smc_lgr_list; /* list of link groups */
+
+enum smc_lgr_role {		/* possible roles of a link group */
+	SMC_CLNT,	/* client */
+	SMC_SERV	/* server */
+};
+
+struct smc_link {
+	struct smc_ib_device	*smcibdev;	/* ib-device */
+	u8			ibport;		/* port - values 1 | 2 */
+	struct ib_qp		*roce_qp;	/* IB queue pair */
+	struct ib_qp_attr	qp_attr;	/* IB queue pair attributes */
+	union ib_gid		gid;		/* gid matching used vlan id */
+	u32			peer_qpn;	/* QP number of peer */
+	enum ib_mtu		path_mtu;	/* used mtu */
+	enum ib_mtu		peer_mtu;	/* mtu size of peer */
+	u32			psn_initial;	/* QP tx initial packet seqno */
+	u32			peer_psn;	/* QP rx initial packet seqno */
+	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
+	u8			peer_gid[sizeof(union ib_gid)];	/* gid of peer*/
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX	1
+#define SMC_SINGLE_LINK		0
+
+#define SMC_FIRST_CONTACT	1		/* first contact to a peer */
+#define SMC_REUSE_CONTACT	0		/* follow-on contact to a peer*/
+
+struct smc_link_group {
+	struct list_head	list;
+	enum smc_lgr_role	role;		/* client or server */
+	__be32			daddr;		/* destination ip address */
+	struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];	/* smc link */
+	char			peer_systemid[SMC_SYSTEMID_LEN];
+						/* unique system_id of peer */
+	struct rb_root		conns_all;	/* connection tree */
+	rwlock_t		conns_lock;	/* protects conns_all */
+	unsigned int		conns_num;	/* current # of connections */
+	unsigned short		vlan_id;	/* vlan id of link group */
+	struct delayed_work	free_work;	/* delayed freeing of an lgr */
+	bool			sync_err;	/* lgr no longer fits to peer */
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token	alert token to search for
+ * @lgr		 link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+	u32 token, struct smc_link_group *lgr)
+{
+	struct smc_connection *res = NULL;
+	struct rb_node *node;
+
+	node = lgr->conns_all.rb_node;
+	while (node) {
+		struct smc_connection *cur = rb_entry(node,
+					struct smc_connection, alert_node);
+
+		if (cur->alert_token_local > token) {
+			node = node->rb_left;
+		} else {
+			if (cur->alert_token_local < token) {
+				node = node->rb_right;
+			} else {
+				res = cur;
+				break;
+			}
+		}
+	}
+
+	return res;
+}
+
+void smc_lgr_free(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr);
+
+#endif
-- 
cgit v1.2.3


From cd6851f30386e5e04b5c2253f8e1647ba0ebcd31 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:18 +0100
Subject: smc: remote memory buffers (RMBs)

* allocate data RMB memory for sending and receiving
* size depends on the maximum socket send and receive buffers
* allocated RMBs are kept during life time of the owning link group
* map the allocated RMBs to DMA

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  29 ++++++-
 net/smc/smc.h      |  45 +++++++++++
 net/smc/smc_clc.c  |   6 +-
 net/smc/smc_core.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/smc/smc_core.h |  21 +++++
 net/smc/smc_ib.c   |  19 +++++
 net/smc/smc_ib.h   |   5 ++
 7 files changed, 342 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5fda37decc55..a38f470130d3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -249,6 +249,8 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
 	smc->conn.peer_conn_idx = clc->conn_idx;
+	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 }
 
 static void smc_link_save_peer_info(struct smc_link *link,
@@ -323,6 +325,18 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
 	smc_conn_save_peer_info(smc, &aclc);
+
+	rc = smc_sndbuf_create(smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma_unlock;
+	}
+	rc = smc_rmb_create(smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma_unlock;
+	}
+
 	if (local_contact == SMC_FIRST_CONTACT)
 		smc_link_save_peer_info(link, &aclc);
 	/* tbd in follow-on patch: more steps to setup RDMA communcication,
@@ -598,9 +612,16 @@ static void smc_listen_work(struct work_struct *work)
 	}
 	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
-	/* tbd in follow-on patch: more steps to setup RDMA communcication,
-	 * create rmbs, map rmbs
-	 */
+	rc = smc_sndbuf_create(new_smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma;
+	}
+	rc = smc_rmb_create(new_smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma;
+	}
 
 	rc = smc_clc_send_accept(new_smc, local_contact);
 	if (rc)
@@ -1047,6 +1068,8 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 			      IPPROTO_TCP, &smc->clcsock);
 	if (rc)
 		sk_common_release(sk);
+	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
 
 out:
 	return rc;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 11265bde4655..2bf504492133 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -34,6 +34,16 @@ struct smc_connection {
 	struct smc_link_group	*lgr;		/* link group of connection */
 	u32			alert_token_local; /* unique conn. id */
 	u8			peer_conn_idx;	/* from tcp handshake */
+	int			peer_rmbe_size;	/* size of peer rx buffer */
+	atomic_t		peer_rmbe_space;/* remaining free bytes in peer
+						 * rmbe
+						 */
+
+	struct smc_buf_desc	*sndbuf_desc;	/* send buffer descriptor */
+	int			sndbuf_size;	/* sndbuf size <== sock wmem */
+	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
+	int			rmbe_size;	/* RMBE size <== sock rmem */
+	int			rmbe_size_short;/* compressed notation */
 };
 
 struct smc_sock {				/* smc sock container */
@@ -76,6 +86,41 @@ static inline u32 ntoh24(u8 *net)
 	return be32_to_cpu(t);
 }
 
+#define SMC_BUF_MIN_SIZE 16384		/* minimum size of an RMB */
+
+#define SMC_RMBE_SIZES	16	/* number of distinct sizes for an RMBE */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static inline u8 smc_compress_bufsize(int size)
+{
+	u8 compressed;
+
+	if (size <= SMC_BUF_MIN_SIZE)
+		return 0;
+
+	size = (size - 1) >> 14;
+	compressed = ilog2(size) + 1;
+	if (compressed >= SMC_RMBE_SIZES)
+		compressed = SMC_RMBE_SIZES - 1;
+	return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+	u32 size;
+
+	size = 0x00000001 << (((int)compressed) + 14);
+	return (int)size;
+}
+
 #ifdef CONFIG_XFRM
 static inline bool using_ipsec(struct smc_sock *smc)
 {
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index f8e47c06d2ed..4b475dddef16 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -252,13 +252,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	       SMC_GID_SIZE);
 	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
 	       sizeof(link->smcibdev->mac[link->ibport - 1]));
-
-	/* tbd in follow-on patch: fill in rmb-related values */
-
 	hton24(aclc.qpn, link->roce_qp->qp_num);
 	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
 	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	aclc.qp_mtu = link->path_mtu;
+	aclc.rmbe_size = conn->rmbe_size_short,
+	aclc.rmb_dma_addr =
+		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
 	hton24(aclc.psn, link->psn_initial);
 	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index b88a82918c82..e1b95728ca81 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -133,6 +133,7 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	struct smc_link *lnk;
 	u8 rndvec[3];
 	int rc = 0;
+	int i;
 
 	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
 	if (!lgr) {
@@ -144,6 +145,12 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	lgr->daddr = peer_in_addr;
 	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
 	lgr->vlan_id = vlan_id;
+	rwlock_init(&lgr->sndbufs_lock);
+	rwlock_init(&lgr->rmbs_lock);
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		INIT_LIST_HEAD(&lgr->sndbufs[i]);
+		INIT_LIST_HEAD(&lgr->rmbs[i]);
+	}
 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
 	lgr->conns_all = RB_ROOT;
 
@@ -164,6 +171,22 @@ out:
 	return rc;
 }
 
+static void smc_sndbuf_unuse(struct smc_connection *conn)
+{
+	if (conn->sndbuf_desc) {
+		conn->sndbuf_desc->used = 0;
+		conn->sndbuf_size = 0;
+	}
+}
+
+static void smc_rmb_unuse(struct smc_connection *conn)
+{
+	if (conn->rmb_desc) {
+		conn->rmb_desc->used = 0;
+		conn->rmbe_size = 0;
+	}
+}
+
 /* remove a finished connection from its link group */
 void smc_conn_free(struct smc_connection *conn)
 {
@@ -172,6 +195,8 @@ void smc_conn_free(struct smc_connection *conn)
 	if (!lgr)
 		return;
 	smc_lgr_unregister_conn(conn);
+	smc_rmb_unuse(conn);
+	smc_sndbuf_unuse(conn);
 }
 
 static void smc_link_clear(struct smc_link *lnk)
@@ -179,9 +204,39 @@ static void smc_link_clear(struct smc_link *lnk)
 	lnk->peer_qpn = 0;
 }
 
+static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *sndbuf_desc, *bf_desc;
+	int i;
+
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
+					 list) {
+			kfree(sndbuf_desc->cpu_addr);
+			kfree(sndbuf_desc);
+		}
+	}
+}
+
+static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *rmb_desc, *bf_desc;
+	int i;
+
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
+					 list) {
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+		}
+	}
+}
+
 /* remove a link group */
 void smc_lgr_free(struct smc_link_group *lgr)
 {
+	smc_lgr_free_rmbs(lgr);
+	smc_lgr_free_sndbufs(lgr);
 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 	kfree(lgr);
 }
@@ -300,7 +355,9 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
 			    sizeof(lcl->mac)) &&
 		    !lgr->sync_err &&
 		    (lgr->role == role) &&
-		    (lgr->vlan_id == vlan_id)) {
+		    (lgr->vlan_id == vlan_id) &&
+		    ((role == SMC_CLNT) ||
+		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
 			/* link group found */
 			local_contact = SMC_REUSE_CONTACT;
 			conn->lgr = lgr;
@@ -334,3 +391,168 @@ create:
 out:
 	return rc ? rc : local_contact;
 }
+
+/* try to reuse a sndbuf description slot of the sndbufs list for a certain
+ * buf_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+					 int compressed_bufsize)
+{
+	struct smc_buf_desc *sndbuf_slot;
+
+	read_lock_bh(&lgr->sndbufs_lock);
+	list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
+			    list) {
+		if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
+			read_unlock_bh(&lgr->sndbufs_lock);
+			return sndbuf_slot;
+		}
+	}
+	read_unlock_bh(&lgr->sndbufs_lock);
+	return NULL;
+}
+
+/* try to reuse an rmb description slot of the rmbs list for a certain
+ * rmbe_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+				      int compressed_bufsize)
+{
+	struct smc_buf_desc *rmb_slot;
+
+	read_lock_bh(&lgr->rmbs_lock);
+	list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
+			    list) {
+		if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
+			read_unlock_bh(&lgr->rmbs_lock);
+			return rmb_slot;
+		}
+	}
+	read_unlock_bh(&lgr->rmbs_lock);
+	return NULL;
+}
+
+/* create the tx buffer for an SMC socket */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr = conn->lgr;
+	int tmp_bufsize, tmp_bufsize_short;
+	struct smc_buf_desc *sndbuf_desc;
+	int rc;
+
+	/* use socket send buffer size (w/o overhead) as start value */
+	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+		/* check for reusable sndbuf_slot in the link group */
+		sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+		if (sndbuf_desc) {
+			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+			break; /* found reusable slot */
+		}
+		/* try to alloc a new send buffer */
+		sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+		if (!sndbuf_desc)
+			break; /* give up with -ENOMEM */
+		sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+						GFP_KERNEL | __GFP_NOWARN |
+						__GFP_NOMEMALLOC |
+						__GFP_NORETRY);
+		if (!sndbuf_desc->cpu_addr) {
+			kfree(sndbuf_desc);
+			/* if send buffer allocation has failed,
+			 * try a smaller one
+			 */
+			continue;
+		}
+		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				    tmp_bufsize, sndbuf_desc,
+				    DMA_TO_DEVICE);
+		if (rc) {
+			kfree(sndbuf_desc->cpu_addr);
+			kfree(sndbuf_desc);
+			continue; /* if mapping failed, try smaller one */
+		}
+		sndbuf_desc->used = 1;
+		write_lock_bh(&lgr->sndbufs_lock);
+		list_add(&sndbuf_desc->list,
+			 &lgr->sndbufs[tmp_bufsize_short]);
+		write_unlock_bh(&lgr->sndbufs_lock);
+		break;
+	}
+	if (sndbuf_desc && sndbuf_desc->cpu_addr) {
+		conn->sndbuf_desc = sndbuf_desc;
+		conn->sndbuf_size = tmp_bufsize;
+		smc->sk.sk_sndbuf = tmp_bufsize * 2;
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr = conn->lgr;
+	int tmp_bufsize, tmp_bufsize_short;
+	struct smc_buf_desc *rmb_desc;
+	int rc;
+
+	/* use socket recv buffer size (w/o overhead) as start value */
+	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+		/* check for reusable rmb_slot in the link group */
+		rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+		if (rmb_desc) {
+			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+			break; /* found reusable slot */
+		}
+		/* try to alloc a new RMB */
+		rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+		if (!rmb_desc)
+			break; /* give up with -ENOMEM */
+		rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+					     GFP_KERNEL | __GFP_NOWARN |
+					     __GFP_NOMEMALLOC |
+					     __GFP_NORETRY);
+		if (!rmb_desc->cpu_addr) {
+			kfree(rmb_desc);
+			/* if RMB allocation has failed,
+			 * try a smaller one
+			 */
+			continue;
+		}
+		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				    tmp_bufsize, rmb_desc,
+				    DMA_FROM_DEVICE);
+		if (rc) {
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+			continue; /* if mapping failed, try smaller one */
+		}
+		rmb_desc->used = 1;
+		write_lock_bh(&lgr->rmbs_lock);
+		list_add(&rmb_desc->list,
+			 &lgr->rmbs[tmp_bufsize_short]);
+		write_unlock_bh(&lgr->rmbs_lock);
+		break;
+	}
+	if (rmb_desc && rmb_desc->cpu_addr) {
+		conn->rmb_desc = rmb_desc;
+		conn->rmbe_size = tmp_bufsize;
+		conn->rmbe_size_short = tmp_bufsize_short;
+		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 14b787abae02..bf0026db44e9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -16,6 +16,8 @@
 #include "smc.h"
 #include "smc_ib.h"
 
+#define SMC_RMBS_PER_LGR_MAX	255	/* max. # of RMBs per link group */
+
 struct smc_lgr_list {			/* list of link group definition */
 	struct list_head	list;
 	spinlock_t		lock;	/* protects list of link groups */
@@ -52,6 +54,15 @@ struct smc_link {
 #define SMC_FIRST_CONTACT	1		/* first contact to a peer */
 #define SMC_REUSE_CONTACT	0		/* follow-on contact to a peer*/
 
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+	struct list_head	list;
+	u64			dma_addr[SMC_LINKS_PER_LGR_MAX];
+						/* mapped address of buffer */
+	void			*cpu_addr;	/* virtual address of buffer */
+	u32			used;		/* currently used / unused */
+};
+
 struct smc_link_group {
 	struct list_head	list;
 	enum smc_lgr_role	role;		/* client or server */
@@ -63,6 +74,11 @@ struct smc_link_group {
 	rwlock_t		conns_lock;	/* protects conns_all */
 	unsigned int		conns_num;	/* current # of connections */
 	unsigned short		vlan_id;	/* vlan id of link group */
+
+	struct list_head	sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+	rwlock_t		sndbufs_lock;	/* protects tx buffers */
+	struct list_head	rmbs[SMC_RMBE_SIZES];	/* rx buffers */
+	rwlock_t		rmbs_lock;	/* protects rx buffers */
 	struct delayed_work	free_work;	/* delayed freeing of an lgr */
 	bool			sync_err;	/* lgr no longer fits to peer */
 };
@@ -100,7 +116,12 @@ static inline struct smc_connection *smc_lgr_find_conn(
 	return res;
 }
 
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
 void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
+int smc_sndbuf_create(struct smc_sock *smc);
+int smc_rmb_create(struct smc_sock *smc);
 
 #endif
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 5b037f435bc1..762b7e13c93d 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -16,6 +16,7 @@
 
 #include "smc_pnet.h"
 #include "smc_ib.h"
+#include "smc_core.h"
 #include "smc.h"
 
 struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
@@ -29,6 +30,24 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
 								 * identifier
 								 */
 
+/* map a new TX or RX buffer to DMA */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+		   struct smc_buf_desc *buf_slot,
+		   enum dma_data_direction data_direction)
+{
+	int rc = 0;
+
+	if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+		return rc; /* already mapped */
+	buf_slot->dma_addr[SMC_SINGLE_LINK] =
+		ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+				  buf_size, data_direction);
+	if (ib_dma_mapping_error(smcibdev->ibdev,
+				 buf_slot->dma_addr[SMC_SINGLE_LINK]))
+		rc = -EIO;
+	return rc;
+}
+
 static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	struct net_device *ndev;
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 63613e715f4f..c3b61726861a 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -32,9 +32,14 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	u8			initialized : 1; /* ib dev CQ, evthdl done */
 };
 
+struct smc_buf_desc;
+
 int smc_ib_register_client(void) __init;
 void smc_ib_unregister_client(void);
 bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+		   struct smc_buf_desc *buf_slot,
+		   enum dma_data_direction data_direction);
 
 #endif
-- 
cgit v1.2.3


From f38ba179c6ca94ebeb0ac6a0956c4ea533151ad8 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:19 +0100
Subject: smc: work request (WR) base for use by LLC and CDC

The base containers for RDMA transport are work requests and completion
queue entries processed through Infiniband verbs:
* allocate and initialize these areas
* map these areas to DMA
* implement the basic communication consisting of work request posting
  and receival of completion queue events

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile   |   2 +-
 net/smc/smc.h      |   5 +
 net/smc/smc_core.c |  11 ++
 net/smc/smc_core.h |  30 +++
 net/smc/smc_ib.c   |  73 +++++++
 net/smc/smc_ib.h   |  11 ++
 net/smc/smc_wr.c   | 564 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_wr.h   |  95 +++++++++
 8 files changed, 790 insertions(+), 1 deletion(-)
 create mode 100644 net/smc/smc_wr.c
 create mode 100644 net/smc/smc_wr.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index cb8bcd9df53e..b19120ed7102 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 2bf504492133..209a0b5f59cb 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -12,6 +12,7 @@
 
 #include <linux/socket.h>
 #include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
 #include <net/sock.h>
 
 #include "smc_ib.h"
@@ -29,6 +30,10 @@ enum smc_state {		/* possible states of an SMC socket */
 
 struct smc_link_group;
 
+struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
+	u8			type;
+} __aligned(1);
+
 struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e1b95728ca81..0eed4c154081 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -20,6 +20,7 @@
 #include "smc_clc.h"
 #include "smc_core.h"
 #include "smc_ib.h"
+#include "smc_wr.h"
 
 #define SMC_LGR_FREE_DELAY	(600 * HZ)
 
@@ -161,12 +162,20 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
 	get_random_bytes(rndvec, sizeof(rndvec));
 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+	rc = smc_wr_alloc_link_mem(lnk);
+	if (rc)
+		goto free_lgr;
+	init_waitqueue_head(&lnk->wr_tx_wait);
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
 	spin_lock_bh(&smc_lgr_list.lock);
 	list_add(&lgr->list, &smc_lgr_list.list);
 	spin_unlock_bh(&smc_lgr_list.lock);
+	return 0;
+
+free_lgr:
+	kfree(lgr);
 out:
 	return rc;
 }
@@ -202,6 +211,8 @@ void smc_conn_free(struct smc_connection *conn)
 static void smc_link_clear(struct smc_link *lnk)
 {
 	lnk->peer_qpn = 0;
+	smc_wr_free_link(lnk);
+	smc_wr_free_link_mem(lnk);
 }
 
 static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index bf0026db44e9..ca4587a95450 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -11,6 +11,7 @@
 #ifndef _SMC_CORE_H
 #define _SMC_CORE_H
 
+#include <linux/atomic.h>
 #include <rdma/ib_verbs.h>
 
 #include "smc.h"
@@ -30,11 +31,40 @@ enum smc_lgr_role {		/* possible roles of a link group */
 	SMC_SERV	/* server */
 };
 
+#define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
+
+struct smc_wr_buf {
+	u8	raw[SMC_WR_BUF_SIZE];
+};
+
 struct smc_link {
 	struct smc_ib_device	*smcibdev;	/* ib-device */
 	u8			ibport;		/* port - values 1 | 2 */
+	struct ib_pd		*roce_pd;	/* IB protection domain,
+						 * unique for every RoCE QP
+						 */
 	struct ib_qp		*roce_qp;	/* IB queue pair */
 	struct ib_qp_attr	qp_attr;	/* IB queue pair attributes */
+
+	struct smc_wr_buf	*wr_tx_bufs;	/* WR send payload buffers */
+	struct ib_send_wr	*wr_tx_ibs;	/* WR send meta data */
+	struct ib_sge		*wr_tx_sges;	/* WR send gather meta data */
+	struct smc_wr_tx_pend	*wr_tx_pends;	/* WR send waiting for CQE */
+	/* above four vectors have wr_tx_cnt elements and use the same index */
+	dma_addr_t		wr_tx_dma_addr;	/* DMA address of wr_tx_bufs */
+	atomic_long_t		wr_tx_id;	/* seq # of last sent WR */
+	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
+	u32			wr_tx_cnt;	/* number of WR send buffers */
+	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
+
+	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
+	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
+	struct ib_sge		*wr_rx_sges;	/* WR recv scatter meta data */
+	/* above three vectors have wr_rx_cnt elements and use the same index */
+	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
+	u64			wr_rx_id;	/* seq # of last recv WR */
+	u32			wr_rx_cnt;	/* number of WR recv buffers */
+
 	union ib_gid		gid;		/* gid matching used vlan id */
 	u32			peer_qpn;	/* QP number of peer */
 	enum ib_mtu		path_mtu;	/* used mtu */
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 762b7e13c93d..9fb46a63a17e 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -17,6 +17,7 @@
 #include "smc_pnet.h"
 #include "smc_ib.h"
 #include "smc_core.h"
+#include "smc_wr.h"
 #include "smc.h"
 
 struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
@@ -30,6 +31,78 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
 								 * identifier
 								 */
 
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+	ib_dealloc_pd(lnk->roce_pd);
+	lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+	int rc;
+
+	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
+	if (IS_ERR(lnk->roce_pd))
+		lnk->roce_pd = NULL;
+	return rc;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+	switch (ibevent->event) {
+	case IB_EVENT_DEVICE_FATAL:
+	case IB_EVENT_GID_CHANGE:
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_QP_ACCESS_ERR:
+		/* tbd in follow-on patch:
+		 * abnormal close of corresponding connections
+		 */
+		break;
+	default:
+		break;
+	}
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+	ib_destroy_qp(lnk->roce_qp);
+	lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+	struct ib_qp_init_attr qp_attr = {
+		.event_handler = smc_ib_qp_event_handler,
+		.qp_context = lnk,
+		.send_cq = lnk->smcibdev->roce_cq_send,
+		.recv_cq = lnk->smcibdev->roce_cq_recv,
+		.srq = NULL,
+		.cap = {
+			.max_send_wr = SMC_WR_BUF_CNT,
+				/* include unsolicited rdma_writes as well,
+				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
+				 */
+			.max_recv_wr = SMC_WR_BUF_CNT * 3,
+			.max_send_sge = SMC_IB_MAX_SEND_SGE,
+			.max_recv_sge = 1,
+			.max_inline_data = SMC_WR_TX_SIZE,
+		},
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
+		.qp_type = IB_QPT_RC,
+	};
+	int rc;
+
+	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
+	if (IS_ERR(lnk->roce_qp))
+		lnk->roce_qp = NULL;
+	else
+		smc_wr_remember_qp_attr(lnk);
+	return rc;
+}
+
 /* map a new TX or RX buffer to DMA */
 int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
 		   struct smc_buf_desc *buf_slot,
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index c3b61726861a..441fd168911d 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -16,6 +16,8 @@
 #define SMC_MAX_PORTS			2	/* Max # of ports */
 #define SMC_GID_SIZE			sizeof(union ib_gid)
 
+#define SMC_IB_MAX_SEND_SGE		2
+
 struct smc_ib_devices {			/* list of smc ib devices definition */
 	struct list_head	list;
 	spinlock_t		lock;	/* protects list of smc ib devices */
@@ -27,12 +29,17 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	struct list_head	list;
 	struct ib_device	*ibdev;
 	struct ib_port_attr	pattr[SMC_MAX_PORTS];	/* ib dev. port attrs */
+	struct ib_cq		*roce_cq_send;	/* send completion queue */
+	struct ib_cq		*roce_cq_recv;	/* recv completion queue */
+	struct tasklet_struct	send_tasklet;	/* called by send cq handler */
+	struct tasklet_struct	recv_tasklet;	/* called by recv cq handler */
 	char			mac[SMC_MAX_PORTS][6]; /* mac address per port*/
 	union ib_gid		gid[SMC_MAX_PORTS]; /* gid per port */
 	u8			initialized : 1; /* ib dev CQ, evthdl done */
 };
 
 struct smc_buf_desc;
+struct smc_link;
 
 int smc_ib_register_client(void) __init;
 void smc_ib_unregister_client(void);
@@ -41,5 +48,9 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
 		   struct smc_buf_desc *buf_slot,
 		   enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
 
 #endif
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..a2bc6b612d03
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,564 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend {	/* control data for a pending send request */
+	u64			wr_id;		/* work request id sent */
+	smc_wr_tx_handler	handler;
+	enum ib_wc_status	wc_status;	/* CQE status */
+	struct smc_link		*link;
+	u32			idx;
+	struct smc_wr_tx_pend_priv priv;
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+	u32 i;
+
+	for (i = 0; i < link->wr_tx_cnt; i++) {
+		if (link->wr_tx_pends[i].wr_id == wr_id)
+			return i;
+	}
+	return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+	struct smc_wr_tx_pend pnd_snd;
+	struct smc_link *link;
+	u32 pnd_snd_idx;
+	int i;
+
+	link = wc->qp->qp_context;
+	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+	if (pnd_snd_idx == link->wr_tx_cnt)
+		return;
+	link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+	memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+	/* clear the full struct smc_wr_tx_pend including .priv */
+	memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+	       sizeof(link->wr_tx_pends[pnd_snd_idx]));
+	memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+	       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+	if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+		return;
+	if (wc->status) {
+		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+			/* clear full struct smc_wr_tx_pend including .priv */
+			memset(&link->wr_tx_pends[i], 0,
+			       sizeof(link->wr_tx_pends[i]));
+			memset(&link->wr_tx_bufs[i], 0,
+			       sizeof(link->wr_tx_bufs[i]));
+			clear_bit(i, link->wr_tx_mask);
+		}
+		/* tbd in future patch: terminate connections of this link
+		 * group abnormally
+		 */
+	}
+	if (pnd_snd.handler)
+		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+	wake_up(&link->wr_tx_wait);
+}
+
+static void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)data;
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int i = 0, rc;
+	int polled = 0;
+
+again:
+	polled++;
+	do {
+		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+		if (polled == 1) {
+			ib_req_notify_cq(dev->roce_cq_send,
+					 IB_CQ_NEXT_COMP |
+					 IB_CQ_REPORT_MISSED_EVENTS);
+		}
+		if (!rc)
+			break;
+		for (i = 0; i < rc; i++)
+			smc_wr_tx_process_cqe(&wc[i]);
+	} while (rc > 0);
+	if (polled == 1)
+		goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+	tasklet_schedule(&dev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+	*idx = link->wr_tx_cnt;
+	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+		if (!test_and_set_bit(*idx, link->wr_tx_mask))
+			return 0;
+	}
+	*idx = link->wr_tx_cnt;
+	return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ *			and sets info for pending transmit tracking
+ * @link:		Pointer to smc_link used to later send the message.
+ * @handler:		Send completion handler function pointer.
+ * @wr_buf:		Out value returns pointer to message buffer.
+ * @wr_pend_priv:	Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+			    smc_wr_tx_handler handler,
+			    struct smc_wr_buf **wr_buf,
+			    struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+	struct smc_wr_tx_pend *wr_pend;
+	struct ib_send_wr *wr_ib;
+	u64 wr_id;
+	u32 idx;
+	int rc;
+
+	*wr_buf = NULL;
+	*wr_pend_priv = NULL;
+	if (in_softirq()) {
+		rc = smc_wr_tx_get_free_slot_index(link, &idx);
+		if (rc)
+			return rc;
+	} else {
+		rc = wait_event_interruptible_timeout(
+			link->wr_tx_wait,
+			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+		if (!rc) {
+			/* tbd in future patch: timeout - terminate connections
+			 * of this link group abnormally
+			 */
+			return -EPIPE;
+		}
+		if (rc == -ERESTARTSYS)
+			return -EINTR;
+		if (idx == link->wr_tx_cnt)
+			return -EPIPE;
+	}
+	wr_id = smc_wr_tx_get_next_wr_id(link);
+	wr_pend = &link->wr_tx_pends[idx];
+	wr_pend->wr_id = wr_id;
+	wr_pend->handler = handler;
+	wr_pend->link = link;
+	wr_pend->idx = idx;
+	wr_ib = &link->wr_tx_ibs[idx];
+	wr_ib->wr_id = wr_id;
+	*wr_buf = &link->wr_tx_bufs[idx];
+	*wr_pend_priv = &wr_pend->priv;
+	return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+		       struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+	struct smc_wr_tx_pend *pend;
+
+	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+	if (pend->idx < link->wr_tx_cnt) {
+		/* clear the full struct smc_wr_tx_pend including .priv */
+		memset(&link->wr_tx_pends[pend->idx], 0,
+		       sizeof(link->wr_tx_pends[pend->idx]));
+		memset(&link->wr_tx_bufs[pend->idx], 0,
+		       sizeof(link->wr_tx_bufs[pend->idx]));
+		test_and_clear_bit(pend->idx, link->wr_tx_mask);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+	struct ib_send_wr *failed_wr = NULL;
+	struct smc_wr_tx_pend *pend;
+	int rc;
+
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+	pend = container_of(priv, struct smc_wr_tx_pend, priv);
+	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
+			  &failed_wr);
+	if (rc)
+		smc_wr_tx_put_slot(link, priv);
+	return rc;
+}
+
+/****************************** receive queue ********************************/
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+	struct smc_wr_rx_handler *h_iter;
+	int rc = 0;
+
+	spin_lock(&smc_wr_rx_hash_lock);
+	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+		if (h_iter->type == handler->type) {
+			rc = -EEXIST;
+			goto out_unlock;
+		}
+	}
+	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+	spin_unlock(&smc_wr_rx_hash_lock);
+	return rc;
+}
+
+/* Demultiplex a received work request based on the message type to its handler.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	struct smc_wr_rx_handler *handler;
+	struct smc_wr_rx_hdr *wr_rx;
+	u64 temp_wr_id;
+	u32 index;
+
+	if (wc->byte_len < sizeof(*wr_rx))
+		return; /* short message */
+	temp_wr_id = wc->wr_id;
+	index = do_div(temp_wr_id, link->wr_rx_cnt);
+	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+		if (handler->type == wr_rx->type)
+			handler->handler(wc, wr_rx);
+	}
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+	struct smc_link *link;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		link = wc[i].qp->qp_context;
+		if (wc[i].status == IB_WC_SUCCESS) {
+			smc_wr_rx_demultiplex(&wc[i]);
+			smc_wr_rx_post(link); /* refill WR RX */
+		} else {
+			/* handle status errors */
+			switch (wc[i].status) {
+			case IB_WC_RETRY_EXC_ERR:
+			case IB_WC_RNR_RETRY_EXC_ERR:
+			case IB_WC_WR_FLUSH_ERR:
+			/* tbd in future patch: terminate connections of this
+			 * link group abnormally
+			 */
+				break;
+			default:
+				smc_wr_rx_post(link); /* refill WR RX */
+				break;
+			}
+		}
+	}
+}
+
+static void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)data;
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int polled = 0;
+	int rc;
+
+again:
+	polled++;
+	do {
+		memset(&wc, 0, sizeof(wc));
+		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+		if (polled == 1) {
+			ib_req_notify_cq(dev->roce_cq_recv,
+					 IB_CQ_SOLICITED_MASK
+					 | IB_CQ_REPORT_MISSED_EVENTS);
+		}
+		if (!rc)
+			break;
+		smc_wr_rx_process_cqes(&wc[0], rc);
+	} while (rc > 0);
+	if (polled == 1)
+		goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+	tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+	u32 i;
+	int rc = 0;
+
+	for (i = 0; i < link->wr_rx_cnt; i++)
+		rc = smc_wr_rx_post(link);
+	return rc;
+}
+
+/***************************** init, exit, misc ******************************/
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+	struct ib_qp_attr *attr = &lnk->qp_attr;
+	struct ib_qp_init_attr init_attr;
+
+	memset(attr, 0, sizeof(*attr));
+	memset(&init_attr, 0, sizeof(init_attr));
+	ib_query_qp(lnk->roce_qp, attr,
+		    IB_QP_STATE |
+		    IB_QP_CUR_STATE |
+		    IB_QP_PKEY_INDEX |
+		    IB_QP_PORT |
+		    IB_QP_QKEY |
+		    IB_QP_AV |
+		    IB_QP_PATH_MTU |
+		    IB_QP_TIMEOUT |
+		    IB_QP_RETRY_CNT |
+		    IB_QP_RNR_RETRY |
+		    IB_QP_RQ_PSN |
+		    IB_QP_ALT_PATH |
+		    IB_QP_MIN_RNR_TIMER |
+		    IB_QP_SQ_PSN |
+		    IB_QP_PATH_MIG_STATE |
+		    IB_QP_CAP |
+		    IB_QP_DEST_QPN,
+		    &init_attr);
+
+	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+			       lnk->qp_attr.cap.max_send_wr);
+	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+			       lnk->qp_attr.cap.max_recv_wr);
+}
+
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+	u32 i;
+
+	for (i = 0; i < lnk->wr_tx_cnt; i++) {
+		lnk->wr_tx_sges[i].addr =
+			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+		lnk->wr_tx_ibs[i].next = NULL;
+		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+		lnk->wr_tx_ibs[i].num_sge = 1;
+		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+		lnk->wr_tx_ibs[i].send_flags =
+			IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
+	}
+	for (i = 0; i < lnk->wr_rx_cnt; i++) {
+		lnk->wr_rx_sges[i].addr =
+			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+		lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+		lnk->wr_rx_ibs[i].next = NULL;
+		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+		lnk->wr_rx_ibs[i].num_sge = 1;
+	}
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev;
+
+	memset(lnk->wr_tx_mask, 0,
+	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+
+	if (!lnk->smcibdev)
+		return;
+	ibdev = lnk->smcibdev->ibdev;
+
+	if (lnk->wr_rx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_dma_addr = 0;
+	}
+	if (lnk->wr_tx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_dma_addr = 0;
+	}
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+	kfree(lnk->wr_tx_pends);
+	lnk->wr_tx_pends = NULL;
+	kfree(lnk->wr_tx_mask);
+	lnk->wr_tx_mask = NULL;
+	kfree(lnk->wr_tx_sges);
+	lnk->wr_tx_sges = NULL;
+	kfree(lnk->wr_rx_sges);
+	lnk->wr_rx_sges = NULL;
+	kfree(lnk->wr_rx_ibs);
+	lnk->wr_rx_ibs = NULL;
+	kfree(lnk->wr_tx_ibs);
+	lnk->wr_tx_ibs = NULL;
+	kfree(lnk->wr_tx_bufs);
+	lnk->wr_tx_bufs = NULL;
+	kfree(lnk->wr_rx_bufs);
+	lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+	/* allocate link related memory */
+	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+	if (!link->wr_tx_bufs)
+		goto no_mem;
+	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+				   GFP_KERNEL);
+	if (!link->wr_rx_bufs)
+		goto no_mem_wr_tx_bufs;
+	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_tx_ibs)
+		goto no_mem_wr_rx_bufs;
+	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+				  sizeof(link->wr_rx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_rx_ibs)
+		goto no_mem_wr_tx_ibs;
+	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+				   GFP_KERNEL);
+	if (!link->wr_tx_sges)
+		goto no_mem_wr_rx_ibs;
+	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+				   sizeof(link->wr_rx_sges[0]),
+				   GFP_KERNEL);
+	if (!link->wr_rx_sges)
+		goto no_mem_wr_tx_sges;
+	link->wr_tx_mask = kzalloc(
+		BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
+		GFP_KERNEL);
+	if (!link->wr_tx_mask)
+		goto no_mem_wr_rx_sges;
+	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_pends[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_pends)
+		goto no_mem_wr_tx_mask;
+	return 0;
+
+no_mem_wr_tx_mask:
+	kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+	kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+	kfree(link->wr_tx_sges);
+no_mem_wr_rx_ibs:
+	kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+	kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+	kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+	kfree(link->wr_tx_bufs);
+no_mem:
+	return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+	tasklet_kill(&smcibdev->recv_tasklet);
+	tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+	tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+		     (unsigned long)smcibdev);
+	tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+		     (unsigned long)smcibdev);
+}
+
+int smc_wr_create_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev = lnk->smcibdev->ibdev;
+	int rc = 0;
+
+	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+	lnk->wr_rx_id = 0;
+	lnk->wr_rx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+		DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+		lnk->wr_rx_dma_addr = 0;
+		rc = -EIO;
+		goto out;
+	}
+	lnk->wr_tx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+		DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+		rc = -EIO;
+		goto dma_unmap;
+	}
+	smc_wr_init_sge(lnk);
+	memset(lnk->wr_tx_mask, 0,
+	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+	return rc;
+
+dma_unmap:
+	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+			    DMA_FROM_DEVICE);
+	lnk->wr_rx_dma_addr = 0;
+out:
+	return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b626728ce44
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,95 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_MAX_CQE 32768	/* max. # of completion queue elements */
+#define SMC_WR_BUF_CNT 16	/* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME	(10 * HZ)
+#define SMC_WR_TX_WAIT_PENDING_TIME	(5 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+				  struct smc_link *,
+				  enum ib_wc_status);
+
+struct smc_wr_rx_handler {
+	struct hlist_node	list;	/* hash table collision resolution */
+	void			(*handler)(struct ib_wc *, void *);
+	u8			type;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+	return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+	atomic_long_set(wr_tx_id, val);
+}
+
+/* post a new receive work request to fill a completed old work request entry */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+	struct ib_recv_wr *bad_recv_wr = NULL;
+	int rc;
+	u64 wr_id, temp_wr_id;
+	u32 index;
+
+	wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+	temp_wr_id = wr_id;
+	index = do_div(temp_wr_id, link->wr_rx_cnt);
+	link->wr_rx_ibs[index].wr_id = wr_id;
+	rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+	return rc;
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+			    struct smc_wr_buf **wr_buf,
+			    struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+		       struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+		   struct smc_wr_tx_pend_priv *wr_pend_priv);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+
+#endif /* SMC_WR_H */
-- 
cgit v1.2.3


From bd4ad57718cc86d2972a20f9791cd079996a4dd6 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:20 +0100
Subject: smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR

Prepare the link for RDMA transport:
Create a queue pair (QP) and move it into the state Ready-To-Receive (RTR).

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  34 ++++++--
 net/smc/smc.h      |   1 +
 net/smc/smc_clc.c  |  10 ++-
 net/smc/smc_core.c |  76 ++++++++++++++++++
 net/smc/smc_core.h |  18 +++++
 net/smc/smc_ib.c   | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_ib.h   |  13 +++
 net/smc/smc_wr.c   |   2 +
 8 files changed, 374 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index a38f470130d3..1026fad35998 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -339,9 +339,20 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	if (local_contact == SMC_FIRST_CONTACT)
 		smc_link_save_peer_info(link, &aclc);
-	/* tbd in follow-on patch: more steps to setup RDMA communcication,
-	 * create rmbs, map rmbs, rtoken_handling, modify_qp
-	 */
+
+	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_INTERR;
+		goto decline_rdma_unlock;
+	}
+
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_ib_ready_link(link);
+		if (rc) {
+			reason_code = SMC_CLC_DECL_INTERR;
+			goto decline_rdma_unlock;
+		}
+	}
 
 	rc = smc_clc_send_confirm(smc);
 	if (rc)
@@ -638,9 +649,20 @@ static void smc_listen_work(struct work_struct *work)
 	if (local_contact == SMC_FIRST_CONTACT)
 		smc_link_save_peer_info(link, &cclc);
 
-	/* tbd in follow-on patch: more steps to setup RDMA communcication,
-	 * rtoken_handling, modify_qp
-	 */
+	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_INTERR;
+		goto decline_rdma;
+	}
+
+	/* tbd in follow-on patch: modify_qp, llc_confirm */
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_ib_ready_link(link);
+		if (rc) {
+			reason_code = SMC_CLC_DECL_INTERR;
+			goto decline_rdma;
+		}
+	}
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 209a0b5f59cb..4d2e54d176f2 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -43,6 +43,7 @@ struct smc_connection {
 	atomic_t		peer_rmbe_space;/* remaining free bytes in peer
 						 * rmbe
 						 */
+	int			rtoken_idx;	/* idx to peer RMB rkey/addr */
 
 	struct smc_buf_desc	*sndbuf_desc;	/* send buffer descriptor */
 	int			sndbuf_size;	/* sndbuf size <== sock wmem */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 4b475dddef16..e1e684c562b8 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -201,13 +201,15 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	       SMC_GID_SIZE);
 	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
 	       sizeof(link->smcibdev->mac));
-
-	/* tbd in follow-on patch: fill in rmb-related values */
-
 	hton24(cclc.qpn, link->roce_qp->qp_num);
+	cclc.rmb_rkey =
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
 	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
 	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+	cclc.rmbe_size = conn->rmbe_size_short;
+	cclc.rmb_dma_addr =
+		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
 	hton24(cclc.psn, link->psn_initial);
 
 	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
@@ -253,6 +255,8 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
 	       sizeof(link->smcibdev->mac[link->ibport - 1]));
 	hton24(aclc.qpn, link->roce_qp->qp_num);
+	aclc.rmb_rkey =
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
 	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
 	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	aclc.qp_mtu = link->path_mtu;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 0eed4c154081..0e9adbd9cd68 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -160,12 +160,23 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	lnk->smcibdev = smcibdev;
 	lnk->ibport = ibport;
 	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+	if (!smcibdev->initialized)
+		smc_ib_setup_per_ibdev(smcibdev);
 	get_random_bytes(rndvec, sizeof(rndvec));
 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
 	rc = smc_wr_alloc_link_mem(lnk);
 	if (rc)
 		goto free_lgr;
 	init_waitqueue_head(&lnk->wr_tx_wait);
+	rc = smc_ib_create_protection_domain(lnk);
+	if (rc)
+		goto free_link_mem;
+	rc = smc_ib_create_queue_pair(lnk);
+	if (rc)
+		goto dealloc_pd;
+	rc = smc_wr_create_link(lnk);
+	if (rc)
+		goto destroy_qp;
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
@@ -174,6 +185,12 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	spin_unlock_bh(&smc_lgr_list.lock);
 	return 0;
 
+destroy_qp:
+	smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+	smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+	smc_wr_free_link_mem(lnk);
 free_lgr:
 	kfree(lgr);
 out:
@@ -211,7 +228,10 @@ void smc_conn_free(struct smc_connection *conn)
 static void smc_link_clear(struct smc_link *lnk)
 {
 	lnk->peer_qpn = 0;
+	smc_ib_modify_qp_reset(lnk);
 	smc_wr_free_link(lnk);
+	smc_ib_destroy_queue_pair(lnk);
+	smc_ib_dealloc_protection_domain(lnk);
 	smc_wr_free_link_mem(lnk);
 }
 
@@ -223,6 +243,10 @@ static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
 		list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
 					 list) {
+			list_del(&sndbuf_desc->list);
+			smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+					 smc_uncompress_bufsize(i),
+					 sndbuf_desc, DMA_TO_DEVICE);
 			kfree(sndbuf_desc->cpu_addr);
 			kfree(sndbuf_desc);
 		}
@@ -232,11 +256,16 @@ static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
 static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
 {
 	struct smc_buf_desc *rmb_desc, *bf_desc;
+	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
 	int i;
 
 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
 		list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
 					 list) {
+			list_del(&rmb_desc->list);
+			smc_ib_buf_unmap(lnk->smcibdev,
+					 smc_uncompress_bufsize(i),
+					 rmb_desc, DMA_FROM_DEVICE);
 			kfree(rmb_desc->cpu_addr);
 			kfree(rmb_desc);
 		}
@@ -550,6 +579,18 @@ int smc_rmb_create(struct smc_sock *smc)
 			kfree(rmb_desc);
 			continue; /* if mapping failed, try smaller one */
 		}
+		rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
+					      IB_ACCESS_REMOTE_WRITE |
+					      IB_ACCESS_LOCAL_WRITE,
+					     &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+		if (rc) {
+			smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+					 tmp_bufsize, rmb_desc,
+					 DMA_FROM_DEVICE);
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+			continue;
+		}
 		rmb_desc->used = 1;
 		write_lock_bh(&lgr->rmbs_lock);
 		list_add(&rmb_desc->list,
@@ -567,3 +608,38 @@ int smc_rmb_create(struct smc_sock *smc)
 		return -ENOMEM;
 	}
 }
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+	int i;
+
+	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+			return i;
+	}
+	return -ENOSPC;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_clc_msg_accept_confirm *clc)
+{
+	u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
+	struct smc_link_group *lgr = conn->lgr;
+	u32 rkey = ntohl(clc->rmb_rkey);
+	int i;
+
+	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+		    test_bit(i, lgr->rtokens_used_mask)) {
+			conn->rtoken_idx = i;
+			return 0;
+		}
+	}
+	conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+	if (conn->rtoken_idx < 0)
+		return conn->rtoken_idx;
+	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
+	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
+	return 0;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index ca4587a95450..f5ea52086d6d 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -90,9 +90,18 @@ struct smc_buf_desc {
 	u64			dma_addr[SMC_LINKS_PER_LGR_MAX];
 						/* mapped address of buffer */
 	void			*cpu_addr;	/* virtual address of buffer */
+	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
+						/* for rmb only:
+						 * rkey provided to peer
+						 */
 	u32			used;		/* currently used / unused */
 };
 
+struct smc_rtoken {				/* address/key of remote RMB */
+	u64			dma_addr;
+	u32			rkey;
+};
+
 struct smc_link_group {
 	struct list_head	list;
 	enum smc_lgr_role	role;		/* client or server */
@@ -109,6 +118,13 @@ struct smc_link_group {
 	rwlock_t		sndbufs_lock;	/* protects tx buffers */
 	struct list_head	rmbs[SMC_RMBE_SIZES];	/* rx buffers */
 	rwlock_t		rmbs_lock;	/* protects rx buffers */
+	struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
+				       [SMC_LINKS_PER_LGR_MAX];
+						/* remote addr/key pairs */
+	unsigned long		rtokens_used_mask[BITS_TO_LONGS(
+							SMC_RMBS_PER_LGR_MAX)];
+						/* used rtoken elements */
+
 	struct delayed_work	free_work;	/* delayed freeing of an lgr */
 	bool			sync_err;	/* lgr no longer fits to peer */
 };
@@ -153,5 +169,7 @@ void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
 int smc_sndbuf_create(struct smc_sock *smc);
 int smc_rmb_create(struct smc_sock *smc);
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_clc_msg_accept_confirm *clc);
 
 #endif
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 9fb46a63a17e..e6743c008ac5 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/random.h>
+#include <linux/workqueue.h>
 #include <rdma/ib_verbs.h>
 
 #include "smc_pnet.h"
@@ -20,6 +21,11 @@
 #include "smc_wr.h"
 #include "smc.h"
 
+#define SMC_QP_MIN_RNR_TIMER		5
+#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT			7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY			7 /* 7: infinite */
+
 struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
 	.lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
 	.list = LIST_HEAD_INIT(smc_ib_devices.list),
@@ -31,6 +37,172 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
 								 * identifier
 								 */
 
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct ib_mr **mr)
+{
+	int rc;
+
+	if (*mr)
+		return 0; /* already done */
+
+	/* obtain unique key -
+	 * next invocation of get_dma_mr returns a different key!
+	 */
+	*mr = pd->device->get_dma_mr(pd, access_flags);
+	rc = PTR_ERR_OR_ZERO(*mr);
+	if (IS_ERR(*mr))
+		*mr = NULL;
+	return rc;
+}
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.pkey_index = 0;
+	qp_attr.port_num = lnk->ibport;
+	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+				| IB_ACCESS_REMOTE_WRITE;
+	return ib_modify_qp(lnk->roce_qp, &qp_attr,
+			    IB_QP_STATE | IB_QP_PKEY_INDEX |
+			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+	enum ib_qp_attr_mask qp_attr_mask =
+		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RTR;
+	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+	qp_attr.ah_attr.port_num = lnk->ibport;
+	qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+	qp_attr.ah_attr.grh.hop_limit = 1;
+	memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
+	       sizeof(lnk->peer_gid));
+	memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
+	       sizeof(lnk->peer_mac));
+	qp_attr.dest_qp_num = lnk->peer_qpn;
+	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+					 * requests
+					 */
+	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
+	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
+	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
+	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
+	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
+					 * atomic ops allowed
+					 */
+	return ib_modify_qp(lnk->roce_qp, &qp_attr,
+			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+			    IB_QP_MAX_QP_RD_ATOMIC);
+}
+
+int smc_ib_modify_qp_reset(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RESET;
+	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+	struct smc_link_group *lgr =
+		container_of(lnk, struct smc_link_group, lnk[0]);
+	int rc = 0;
+
+	rc = smc_ib_modify_qp_init(lnk);
+	if (rc)
+		goto out;
+
+	rc = smc_ib_modify_qp_rtr(lnk);
+	if (rc)
+		goto out;
+	smc_wr_remember_qp_attr(lnk);
+	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+			      IB_CQ_SOLICITED_MASK);
+	if (rc)
+		goto out;
+	rc = smc_wr_rx_post_init(lnk);
+	if (rc)
+		goto out;
+	smc_wr_remember_qp_attr(lnk);
+
+	if (lgr->role == SMC_SERV) {
+		rc = smc_ib_modify_qp_rts(lnk);
+		if (rc)
+			goto out;
+		smc_wr_remember_qp_attr(lnk);
+	}
+out:
+	return rc;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+	struct smc_ib_device *smcibdev = container_of(
+		work, struct smc_ib_device, port_event_work);
+	u8 port_idx;
+
+	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+		clear_bit(port_idx, &smcibdev->port_event_mask);
+	}
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+					struct ib_event *ibevent)
+{
+	struct smc_ib_device *smcibdev;
+	u8 port_idx;
+
+	smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+	if (!smc_pnet_find_ib(smcibdev->ibdev->name))
+		return;
+
+	switch (ibevent->event) {
+	case IB_EVENT_PORT_ERR:
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
+		/* fall through */
+	case IB_EVENT_DEVICE_FATAL:
+		/* tbd in follow-on patch:
+		 * abnormal close of corresponding connections
+		 */
+		break;
+	case IB_EVENT_PORT_ACTIVE:
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
+		break;
+	default:
+		break;
+	}
+}
+
 void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
 {
 	ib_dealloc_pd(lnk->roce_pd);
@@ -121,6 +293,17 @@ int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
 	return rc;
 }
 
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+		      struct smc_buf_desc *buf_slot,
+		      enum dma_data_direction data_direction)
+{
+	if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
+		return; /* already unmapped */
+	ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
+			    data_direction);
+	buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+}
+
 static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	struct net_device *ndev;
@@ -184,6 +367,50 @@ out:
 	return rc;
 }
 
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	struct ib_cq_init_attr cqattr =	{
+		.cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+	long rc;
+
+	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_tx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+	if (IS_ERR(smcibdev->roce_cq_send)) {
+		smcibdev->roce_cq_send = NULL;
+		return rc;
+	}
+	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_rx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+	if (IS_ERR(smcibdev->roce_cq_recv)) {
+		smcibdev->roce_cq_recv = NULL;
+		goto err;
+	}
+	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+			      smc_ib_global_event_handler);
+	ib_register_event_handler(&smcibdev->event_handler);
+	smc_wr_add_dev(smcibdev);
+	smcibdev->initialized = 1;
+	return rc;
+
+err:
+	ib_destroy_cq(smcibdev->roce_cq_send);
+	return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	if (!smcibdev->initialized)
+		return;
+	smc_wr_remove_dev(smcibdev);
+	ib_unregister_event_handler(&smcibdev->event_handler);
+	ib_destroy_cq(smcibdev->roce_cq_recv);
+	ib_destroy_cq(smcibdev->roce_cq_send);
+}
+
 static struct ib_client smc_ib_client;
 
 /* callback function for ib_register_client() */
@@ -199,6 +426,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
 		return;
 
 	smcibdev->ibdev = ibdev;
+	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
 
 	spin_lock(&smc_ib_devices.lock);
 	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
@@ -217,6 +445,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
 	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
 	spin_unlock(&smc_ib_devices.lock);
 	smc_pnet_remove_by_ibdev(smcibdev);
+	smc_ib_cleanup_per_ibdev(smcibdev);
 	kfree(smcibdev);
 }
 
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 441fd168911d..3fe2d55c6051 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -29,6 +29,7 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	struct list_head	list;
 	struct ib_device	*ibdev;
 	struct ib_port_attr	pattr[SMC_MAX_PORTS];	/* ib dev. port attrs */
+	struct ib_event_handler	event_handler;	/* global ib_event handler */
 	struct ib_cq		*roce_cq_send;	/* send completion queue */
 	struct ib_cq		*roce_cq_recv;	/* recv completion queue */
 	struct tasklet_struct	send_tasklet;	/* called by send cq handler */
@@ -36,6 +37,8 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	char			mac[SMC_MAX_PORTS][6]; /* mac address per port*/
 	union ib_gid		gid[SMC_MAX_PORTS]; /* gid per port */
 	u8			initialized : 1; /* ib dev CQ, evthdl done */
+	struct work_struct	port_event_work;
+	unsigned long		port_event_mask;
 };
 
 struct smc_buf_desc;
@@ -48,9 +51,19 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
 		   struct smc_buf_desc *buf_slot,
 		   enum dma_data_direction data_direction);
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
+		      struct smc_buf_desc *buf_slot,
+		      enum dma_data_direction data_direction);
 void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
 int smc_ib_create_protection_domain(struct smc_link *lnk);
 void smc_ib_destroy_queue_pair(struct smc_link *lnk);
 int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct ib_mr **mr);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_reset(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+
 
 #endif
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index a2bc6b612d03..5cbc1e5f7606 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -393,6 +393,7 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_tx_sges[i].addr =
 			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
 		lnk->wr_tx_ibs[i].next = NULL;
 		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
 		lnk->wr_tx_ibs[i].num_sge = 1;
@@ -404,6 +405,7 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_rx_sges[i].addr =
 			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+		lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
 		lnk->wr_rx_ibs[i].next = NULL;
 		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
 		lnk->wr_rx_ibs[i].num_sge = 1;
-- 
cgit v1.2.3


From 9bf9abead28abaf11d0776b6e0c5d34b6525e846 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:21 +0100
Subject: smc: link layer control (LLC)

send and receive LLC messages CONFIRM_LINK (via IB message send and CQE)

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile   |   2 +-
 net/smc/af_smc.c   |  94 ++++++++++++++++++++++++++++++-
 net/smc/smc_clc.h  |   2 +
 net/smc/smc_core.c |   8 +++
 net/smc/smc_core.h |   6 ++
 net/smc/smc_llc.c  | 158 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_llc.h  |  63 +++++++++++++++++++++
 7 files changed, 330 insertions(+), 3 deletions(-)
 create mode 100644 net/smc/smc_llc.c
 create mode 100644 net/smc/smc_llc.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index b19120ed7102..73320bf452b0 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_SMC)	+= smc.o
-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 1026fad35998..1ae986d2762d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -31,6 +31,7 @@
 
 #include "smc.h"
 #include "smc_clc.h"
+#include "smc_llc.h"
 #include "smc_core.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
@@ -245,6 +246,41 @@ out:
 	return rc;
 }
 
+static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+{
+	struct smc_link_group *lgr = smc->conn.lgr;
+	struct smc_link *link;
+	int rest;
+	int rc;
+
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	/* receive CONFIRM LINK request from server over RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(
+		&link->llc_confirm,
+		SMC_LLC_WAIT_FIRST_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+		return rc;
+	}
+
+	rc = smc_ib_modify_qp_rts(link);
+	if (rc)
+		return SMC_CLC_DECL_INTERR;
+
+	smc_wr_remember_qp_attr(link);
+	/* send CONFIRM LINK response over RoCE fabric */
+	rc = smc_llc_send_confirm_link(link,
+				       link->smcibdev->mac[link->ibport - 1],
+				       gid, SMC_LLC_RESP);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	return rc;
+}
+
 static void smc_conn_save_peer_info(struct smc_sock *smc,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
@@ -358,7 +394,17 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	if (rc)
 		goto out_err_unlock;
 
-	/* tbd in follow-on patch: llc_confirm */
+	if (local_contact == SMC_FIRST_CONTACT) {
+		/* QP confirmation over RoCE fabric */
+		reason_code = smc_clnt_conf_first_link(
+			smc, &smcibdev->gid[ibport - 1]);
+		if (reason_code < 0) {
+			rc = reason_code;
+			goto out_err_unlock;
+		}
+		if (reason_code > 0)
+			goto decline_rdma_unlock;
+	}
 
 	mutex_unlock(&smc_create_lgr_pending);
 out_connected:
@@ -543,6 +589,36 @@ static void smc_close_non_accepted(struct sock *sk)
 	sock_put(sk);
 }
 
+static int smc_serv_conf_first_link(struct smc_sock *smc)
+{
+	struct smc_link_group *lgr = smc->conn.lgr;
+	struct smc_link *link;
+	int rest;
+	int rc;
+
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	/* send CONFIRM LINK request to client over the RoCE fabric */
+	rc = smc_llc_send_confirm_link(link,
+				       link->smcibdev->mac[link->ibport - 1],
+				       &link->smcibdev->gid[link->ibport - 1],
+				       SMC_LLC_REQ);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	/* receive CONFIRM LINK response from client over the RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(
+		&link->llc_confirm_resp,
+		SMC_LLC_WAIT_FIRST_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+	}
+
+	return rc;
+}
+
 /* setup for RDMA connection of server */
 static void smc_listen_work(struct work_struct *work)
 {
@@ -655,13 +731,21 @@ static void smc_listen_work(struct work_struct *work)
 		goto decline_rdma;
 	}
 
-	/* tbd in follow-on patch: modify_qp, llc_confirm */
 	if (local_contact == SMC_FIRST_CONTACT) {
 		rc = smc_ib_ready_link(link);
 		if (rc) {
 			reason_code = SMC_CLC_DECL_INTERR;
 			goto decline_rdma;
 		}
+		/* QP confirmation over RoCE fabric */
+		reason_code = smc_serv_conf_first_link(new_smc);
+		if (reason_code < 0) {
+			/* peer is not aware of a problem */
+			rc = reason_code;
+			goto out_err;
+		}
+		if (reason_code > 0)
+			goto decline_rdma;
 	}
 
 out_connected:
@@ -1111,6 +1195,12 @@ static int __init smc_init(void)
 	if (rc)
 		return rc;
 
+	rc = smc_llc_init();
+	if (rc) {
+		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+		goto out_pnet;
+	}
+
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
 		pr_err("%s: proto_register fails with %d\n", __func__, rc);
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 5924d998b5ca..13db8ce177c9 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -33,6 +33,8 @@ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
 #define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error          */
 #define SMC_CLC_DECL_REPLY	0x06000000  /* reply to a received decline    */
 #define SMC_CLC_DECL_INTERR	0x99990000  /* internal error                 */
+#define SMC_CLC_DECL_TCL	0x02040000  /* timeout w4 QP confirm          */
+#define SMC_CLC_DECL_SEND	0x07000000  /* sending problem                */
 
 struct smc_clc_msg_hdr {	/* header1 of clc messages */
 	u8 eyecatcher[4];	/* eye catcher */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 0e9adbd9cd68..906d88c266c0 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -21,9 +21,13 @@
 #include "smc_core.h"
 #include "smc_ib.h"
 #include "smc_wr.h"
+#include "smc_llc.h"
 
+#define SMC_LGR_NUM_INCR	256
 #define SMC_LGR_FREE_DELAY	(600 * HZ)
 
+static u32 smc_lgr_num;			/* unique link group number */
+
 /* Register connection's alert token in our lookup structure.
  * To use rbtrees we have to implement our own insert core.
  * Requires @conns_lock
@@ -152,6 +156,8 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
 		INIT_LIST_HEAD(&lgr->rmbs[i]);
 	}
+	smc_lgr_num += SMC_LGR_NUM_INCR;
+	memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
 	lgr->conns_all = RB_ROOT;
 
@@ -177,6 +183,8 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	rc = smc_wr_create_link(lnk);
 	if (rc)
 		goto destroy_qp;
+	init_completion(&lnk->llc_confirm);
+	init_completion(&lnk->llc_confirm_resp);
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index f5ea52086d6d..27eb38056a27 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -73,6 +73,9 @@ struct smc_link {
 	u32			peer_psn;	/* QP rx initial packet seqno */
 	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
 	u8			peer_gid[sizeof(union ib_gid)];	/* gid of peer*/
+	u8			link_id;	/* unique # within link group */
+	struct completion	llc_confirm;	/* wait for rx of conf link */
+	struct completion	llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
 };
 
 /* For now we just allow one parallel link per link group. The SMC protocol
@@ -102,6 +105,8 @@ struct smc_rtoken {				/* address/key of remote RMB */
 	u32			rkey;
 };
 
+#define SMC_LGR_ID_SIZE		4
+
 struct smc_link_group {
 	struct list_head	list;
 	enum smc_lgr_role	role;		/* client or server */
@@ -125,6 +130,7 @@ struct smc_link_group {
 							SMC_RMBS_PER_LGR_MAX)];
 						/* used rtoken elements */
 
+	u8			id[SMC_LGR_ID_SIZE];	/* unique lgr id */
 	struct delayed_work	free_work;	/* delayed freeing of an lgr */
 	bool			sync_err;	/* lgr no longer fits to peer */
 };
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Link Layer Control (LLC)
+ *
+ *  For now, we only support the necessary "confirm link" functionality
+ *  which happens for the first RoCE link after successful CLC handshake.
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ *              Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+
+/********************************** send *************************************/
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+			       struct smc_link *link,
+			       enum ib_wc_status wc_status)
+{
+	/* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ *	  It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+				    struct smc_wr_buf **wr_buf,
+				    struct smc_wr_tx_pend_priv **pend)
+{
+	int rc;
+
+	rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+	if (rc < 0)
+		return rc;
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+		"must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+	return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
+			      union ib_gid *gid,
+			      enum smc_llc_reqresp reqresp)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+	struct smc_llc_msg_confirm_link *confllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+	memset(confllc, 0, sizeof(*confllc));
+	confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+	confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+	if (reqresp == SMC_LLC_RESP)
+		confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+	memcpy(confllc->sender_mac, mac, ETH_ALEN);
+	memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+	hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+	/* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
+	memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
+	confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
+/********************************* receive ***********************************/
+
+static void smc_llc_rx_confirm_link(struct smc_link *link,
+				    struct smc_llc_msg_confirm_link *llc)
+{
+	struct smc_link_group *lgr;
+
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		if (lgr->role == SMC_SERV)
+			complete(&link->llc_confirm_resp);
+	} else {
+		if (lgr->role == SMC_CLNT) {
+			link->link_id = llc->link_num;
+			complete(&link->llc_confirm);
+		}
+	}
+}
+
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	union smc_llc_msg *llc = buf;
+
+	if (wc->byte_len < sizeof(*llc))
+		return; /* short message */
+	if (llc->raw.hdr.length != sizeof(*llc))
+		return; /* invalid message */
+	if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+		smc_llc_rx_confirm_link(link, &llc->confirm_link);
+}
+
+/***************************** init, exit, misc ******************************/
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_LINK
+	},
+	{
+		.handler	= NULL,
+	}
+};
+
+int __init smc_llc_init(void)
+{
+	struct smc_wr_rx_handler *handler;
+	int rc = 0;
+
+	for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+		INIT_HLIST_NODE(&handler->list);
+		rc = smc_wr_rx_register_handler(handler);
+		if (rc)
+			break;
+	}
+	return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for LLC (link layer control) message handling
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ *              Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP		0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME		(5 * HZ)
+
+enum smc_llc_reqresp {
+	SMC_LLC_REQ,
+	SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+	SMC_LLC_CONFIRM_LINK		= 0x01,
+};
+
+#define SMC_LLC_DATA_LEN		40
+
+struct smc_llc_hdr {
+	struct smc_wr_rx_hdr common;
+	u8 length;	/* 44 */
+	u8 reserved;
+	u8 flags;
+};
+
+struct smc_llc_msg_confirm_link {	/* type 0x01 */
+	struct smc_llc_hdr hd;
+	u8 sender_mac[ETH_ALEN];
+	u8 sender_gid[SMC_GID_SIZE];
+	u8 sender_qp_num[3];
+	u8 link_num;
+	u8 link_uid[SMC_LGR_ID_SIZE];
+	u8 max_links;
+	u8 reserved[9];
+};
+
+union smc_llc_msg {
+	struct smc_llc_msg_confirm_link confirm_link;
+	struct {
+		struct smc_llc_hdr hdr;
+		u8 data[SMC_LLC_DATA_LEN];
+	} raw;
+};
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+			      enum smc_llc_reqresp reqresp);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
-- 
cgit v1.2.3


From 5f08318f617b05b6ee389d8bd174c7af921ebf19 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:22 +0100
Subject: smc: connection data control (CDC)

send and receive CDC messages (via IB message send and CQE)

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile   |   1 +
 net/smc/af_smc.c   |   9 ++
 net/smc/smc.h      |  97 +++++++++++++++++++
 net/smc/smc_cdc.c  | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_cdc.h  | 217 +++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_core.c |   9 ++
 net/smc/smc_wr.c   |  19 ++++
 net/smc/smc_wr.h   |   9 ++
 8 files changed, 641 insertions(+)
 create mode 100644 net/smc/smc_cdc.c
 create mode 100644 net/smc/smc_cdc.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index 73320bf452b0..ec0fd0338c6d 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 1ae986d2762d..4b1611d46ef7 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -26,12 +26,14 @@
 #include <linux/socket.h>
 #include <linux/inetdevice.h>
 #include <linux/workqueue.h>
+#include <linux/in.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
 #include "smc.h"
 #include "smc_clc.h"
 #include "smc_llc.h"
+#include "smc_cdc.h"
 #include "smc_core.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
@@ -285,6 +287,7 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
 	smc->conn.peer_conn_idx = clc->conn_idx;
+	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 }
@@ -1201,6 +1204,12 @@ static int __init smc_init(void)
 		goto out_pnet;
 	}
 
+	rc = smc_cdc_init();
+	if (rc) {
+		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+		goto out_pnet;
+	}
+
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
 		pr_err("%s: proto_register fails with %d\n", __func__, rc);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 4d2e54d176f2..a8d4a68aac30 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,6 +21,10 @@
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
 
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
 enum smc_state {		/* possible states of an SMC socket */
 	SMC_ACTIVE	= 1,
 	SMC_INIT	= 2,
@@ -34,6 +38,67 @@ struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
 	u8			type;
 } __aligned(1);
 
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8	peer_done_writing : 1;	/* Sending done indicator */
+	u8	peer_conn_closed : 1;	/* Peer connection closed indicator */
+	u8	peer_conn_abort : 1;	/* Abnormal close indicator */
+	u8	reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8	reserved : 5;
+	u8	peer_conn_abort : 1;
+	u8	peer_conn_closed : 1;
+	u8	peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8	write_blocked : 1;	/* Writing Blocked, no rx buf space */
+	u8	urg_data_pending : 1;	/* Urgent Data Pending */
+	u8	urg_data_present : 1;	/* Urgent Data Present */
+	u8	cons_curs_upd_req : 1;	/* cursor update requested */
+	u8	failover_validation : 1;/* message replay due to failover */
+	u8	reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8	reserved : 3;
+	u8	failover_validation : 1;
+	u8	cons_curs_upd_req : 1;
+	u8	urg_data_present : 1;
+	u8	urg_data_pending : 1;
+	u8	write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor {	/* SMC cursor - an offset in an RMBE */
+	struct {
+		u16	reserved;
+		u16	wrap;		/* window wrap sequence number */
+		u32	count;		/* cursor (= offset) part */
+	};
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		acurs;	/* for atomic processing */
+#else
+	u64			acurs;	/* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg {		/* Connection Data Control message */
+	struct smc_wr_rx_hdr		common; /* .type = 0xFE */
+	u8				len;	/* length = 44 */
+	u16				seqno;	/* connection seq # */
+	u32				token;	/* alert_token */
+	union smc_host_cursor		prod;		/* producer cursor */
+	union smc_host_cursor		cons;		/* consumer cursor,
+							 * piggy backed "ack"
+							 */
+	struct smc_cdc_producer_flags	prod_flags;	/* conn. tx/rx status */
+	struct smc_cdc_conn_state_flags	conn_state_flags; /* peer conn. status*/
+	u8				reserved[18];
+} __aligned(8);
+
 struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
@@ -50,6 +115,38 @@ struct smc_connection {
 	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
 	int			rmbe_size;	/* RMBE size <== sock rmem */
 	int			rmbe_size_short;/* compressed notation */
+
+	struct smc_host_cdc_msg	local_tx_ctrl;	/* host byte order staging
+						 * buffer for CDC msg send
+						 * .prod cf. TCP snd_nxt
+						 * .cons cf. TCP sends ack
+						 */
+	union smc_host_cursor	tx_curs_prep;	/* tx - prepared data
+						 * snd_max..wmem_alloc
+						 */
+	union smc_host_cursor	tx_curs_sent;	/* tx - sent data
+						 * snd_nxt ?
+						 */
+	union smc_host_cursor	tx_curs_fin;	/* tx - confirmed by peer
+						 * snd-wnd-begin ?
+						 */
+	atomic_t		sndbuf_space;	/* remaining space in sndbuf */
+	u16			tx_cdc_seq;	/* sequence # for CDC send */
+	spinlock_t		send_lock;	/* protect wr_sends */
+
+	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
+						 * .prod cf. TCP rcv_nxt
+						 * .cons cf. TCP snd_una
+						 */
+	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
+						    * source of snd_una ?
+						    */
+	atomic_t		bytes_to_rcv;	/* arrived data,
+						 * not yet received
+						 */
+#ifndef KERNEL_HAS_ATOMIC64
+	spinlock_t		acurs_lock;	/* protect cursors */
+#endif
 };
 
 struct smc_sock {				/* smc sock container */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..111d23ff9e17
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,280 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+
+/********************************** send *************************************/
+
+struct smc_cdc_tx_pend {
+	struct smc_connection	*conn;		/* socket connection */
+	union smc_host_cursor	cursor;	/* tx sndbuf cursor sent */
+	union smc_host_cursor	p_cursor;	/* rx RMBE cursor produced */
+	u16			ctrl_seq;	/* conn. tx sequence # */
+};
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+			       struct smc_link *link,
+			       enum ib_wc_status wc_status)
+{
+	struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+	struct smc_sock *smc;
+	int diff;
+
+	if (!cdcpend->conn)
+		/* already dismissed */
+		return;
+
+	smc = container_of(cdcpend->conn, struct smc_sock, conn);
+	bh_lock_sock(&smc->sk);
+	if (!wc_status) {
+		diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+				     &cdcpend->conn->tx_curs_fin,
+				     &cdcpend->cursor);
+		/* sndbuf_space is decreased in smc_sendmsg */
+		smp_mb__before_atomic();
+		atomic_add(diff, &cdcpend->conn->sndbuf_space);
+		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		smp_mb__after_atomic();
+		smc_curs_write(&cdcpend->conn->tx_curs_fin,
+			       smc_curs_read(&cdcpend->cursor, cdcpend->conn),
+			       cdcpend->conn);
+	}
+	/* subsequent patch: wake if send buffer space available */
+	bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_link *link,
+			  struct smc_wr_buf **wr_buf,
+			  struct smc_cdc_tx_pend **pend)
+{
+	return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+				       (struct smc_wr_tx_pend_priv **)pend);
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+					    struct smc_cdc_tx_pend *pend)
+{
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+	BUILD_BUG_ON_MSG(
+		offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+		"must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+	pend->conn = conn;
+	pend->cursor = conn->tx_curs_sent;
+	pend->p_cursor = conn->local_tx_ctrl.prod;
+	pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+		     struct smc_wr_buf *wr_buf,
+		     struct smc_cdc_tx_pend *pend)
+{
+	struct smc_link *link;
+	int rc;
+
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	smc_cdc_add_pending_send(conn, pend);
+
+	conn->tx_cdc_seq++;
+	conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+	smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
+			    &conn->local_tx_ctrl, conn);
+	rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+	if (!rc)
+		smc_curs_write(&conn->rx_curs_confirmed,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+
+	return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+				   &pend);
+	if (rc)
+		return rc;
+
+	return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
+			      unsigned long data)
+{
+	struct smc_connection *conn = (struct smc_connection *)data;
+	struct smc_cdc_tx_pend *cdc_pend =
+		(struct smc_cdc_tx_pend *)tx_pend;
+
+	return cdc_pend->conn == conn;
+}
+
+static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
+{
+	struct smc_cdc_tx_pend *cdc_pend =
+		(struct smc_cdc_tx_pend *)tx_pend;
+
+	cdc_pend->conn = NULL;
+}
+
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
+{
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
+				smc_cdc_tx_filter, smc_cdc_tx_dismisser,
+				(unsigned long)conn);
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+	return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+				    struct smc_link *link,
+				    struct smc_cdc_msg *cdc)
+{
+	union smc_host_cursor cons_old, prod_old;
+	struct smc_connection *conn = &smc->conn;
+	int diff_cons, diff_prod;
+
+	if (!cdc->prod_flags.failover_validation) {
+		if (smc_cdc_before(ntohs(cdc->seqno),
+				   conn->local_rx_ctrl.seqno))
+			/* received seqno is old */
+			return;
+	}
+	smc_curs_write(&prod_old,
+		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+		       conn);
+	smc_curs_write(&cons_old,
+		       smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+		       conn);
+	smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+	diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+				  &conn->local_rx_ctrl.cons);
+	if (diff_cons) {
+		/* peer_rmbe_space is decreased during data transfer with RDMA
+		 * write
+		 */
+		smp_mb__before_atomic();
+		atomic_add(diff_cons, &conn->peer_rmbe_space);
+		/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+		smp_mb__after_atomic();
+	}
+
+	diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+				  &conn->local_rx_ctrl.prod);
+	if (diff_prod) {
+		/* bytes_to_rcv is decreased in smc_recvmsg */
+		smp_mb__before_atomic();
+		atomic_add(diff_prod, &conn->bytes_to_rcv);
+		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+		smp_mb__after_atomic();
+	}
+
+	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+		smc->sk.sk_err = ECONNRESET;
+	if (smc_cdc_rxed_any_close_or_senddone(conn))
+		/* subsequent patch: terminate connection */
+
+	/* piggy backed tx info */
+	/* subsequent patch: wake receivers if receive buffer space available */
+
+	/* subsequent patch: trigger socket release if connection closed */
+
+	/* socket connected but not accepted */
+	if (!smc->sk.sk_socket)
+		return;
+
+	/* data available */
+	/* subsequent patch: send delayed ack, wake receivers */
+}
+
+/* called under tasklet context */
+static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
+				    struct smc_link *link, u64 wr_id)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+	struct smc_connection *connection;
+	struct smc_sock *smc;
+
+	/* lookup connection */
+	read_lock_bh(&lgr->conns_lock);
+	connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+	if (!connection) {
+		read_unlock_bh(&lgr->conns_lock);
+		return;
+	}
+	smc = container_of(connection, struct smc_sock, conn);
+	sock_hold(&smc->sk);
+	read_unlock_bh(&lgr->conns_lock);
+	bh_lock_sock(&smc->sk);
+	smc_cdc_msg_recv_action(smc, link, cdc);
+	bh_unlock_sock(&smc->sk);
+	sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	struct smc_cdc_msg *cdc = buf;
+
+	if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+		return; /* short message */
+	if (cdc->len != sizeof(*cdc))
+		return; /* invalid message */
+	smc_cdc_msg_recv(cdc, link, wc->wr_id);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+	{
+		.handler	= smc_cdc_rx_handler,
+		.type		= SMC_CDC_MSG_TYPE
+	},
+	{
+		.handler	= NULL,
+	}
+};
+
+int __init smc_cdc_init(void)
+{
+	struct smc_wr_rx_handler *handler;
+	int rc = 0;
+
+	for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+		INIT_HLIST_NODE(&handler->list);
+		rc = smc_wr_rx_register_handler(handler);
+		if (rc)
+			break;
+	}
+	return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..135f61369a60
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,217 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define	SMC_CDC_MSG_TYPE		0xFE
+
+/* in network byte order */
+union smc_cdc_cursor {		/* SMC cursor */
+	struct {
+		__be16	reserved;
+		__be16	wrap;
+		__be32	count;
+	};
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t	acurs;		/* for atomic processing */
+#else
+	u64		acurs;		/* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+	struct smc_wr_rx_hdr		common; /* .type = 0xFE */
+	u8				len;	/* 44 */
+	__be16				seqno;
+	__be32				token;
+	union smc_cdc_cursor		prod;
+	union smc_cdc_cursor		cons;	/* piggy backed "ack" */
+	struct smc_cdc_producer_flags	prod_flags;
+	struct smc_cdc_conn_state_flags	conn_state_flags;
+	u8				reserved[18];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+	return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
+	       conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+	struct smc_connection *conn)
+{
+	return smc_cdc_rxed_any_close(conn) ||
+	       conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+				int value)
+{
+	curs->count += value;
+	if (curs->count >= size) {
+		curs->wrap++;
+		curs->count -= size;
+	}
+}
+
+/* SMC cursors are 8 bytes long and require atomic reading and writing */
+static inline u64 smc_curs_read(union smc_host_cursor *curs,
+				struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	ret = curs->acurs;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+	return ret;
+#else
+	return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
+				    struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	ret = curs->acurs;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+	return ret;
+#else
+	return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
+				  struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	curs->acurs = val;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+	atomic64_set(&curs->acurs, val);
+#endif
+}
+
+static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
+				      struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	curs->acurs = val;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+	atomic64_set(&curs->acurs, val);
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new */
+static inline int smc_curs_diff(unsigned int size,
+				union smc_host_cursor *old,
+				union smc_host_cursor *new)
+{
+	if (old->wrap != new->wrap)
+		return max_t(int, 0,
+			     ((size - old->count) + new->count));
+
+	return max_t(int, 0, (new->count - old->count));
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+					  union smc_host_cursor *local,
+					  struct smc_connection *conn)
+{
+	union smc_host_cursor temp;
+
+	smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+	peer->count = htonl(temp.count);
+	peer->wrap = htons(temp.wrap);
+	/* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+				       struct smc_host_cdc_msg *local,
+				       struct smc_connection *conn)
+{
+	peer->common.type = local->common.type;
+	peer->len = local->len;
+	peer->seqno = htons(local->seqno);
+	peer->token = htonl(local->token);
+	smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
+	smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
+	peer->prod_flags = local->prod_flags;
+	peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+					  union smc_cdc_cursor *peer,
+					  struct smc_connection *conn)
+{
+	union smc_host_cursor temp, old;
+	union smc_cdc_cursor net;
+
+	smc_curs_write(&old, smc_curs_read(local, conn), conn);
+	smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+	temp.count = ntohl(net.count);
+	temp.wrap = ntohs(net.wrap);
+	if ((old.wrap > temp.wrap) && temp.wrap)
+		return;
+	if ((old.wrap == temp.wrap) &&
+	    (old.count > temp.count))
+		return;
+	smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+				       struct smc_cdc_msg *peer,
+				       struct smc_connection *conn)
+{
+	local->common.type = peer->common.type;
+	local->len = peer->len;
+	local->seqno = ntohs(peer->seqno);
+	local->token = ntohl(peer->token);
+	smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+	smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+	local->prod_flags = peer->prod_flags;
+	local->conn_state_flags = peer->conn_state_flags;
+}
+
+struct smc_cdc_tx_pend;
+
+int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+			  struct smc_cdc_tx_pend **pend);
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+		     struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+int smc_cdc_init(void) __init;
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 906d88c266c0..537e387b9e85 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -22,6 +22,7 @@
 #include "smc_ib.h"
 #include "smc_wr.h"
 #include "smc_llc.h"
+#include "smc_cdc.h"
 
 #define SMC_LGR_NUM_INCR	256
 #define SMC_LGR_FREE_DELAY	(600 * HZ)
@@ -228,6 +229,7 @@ void smc_conn_free(struct smc_connection *conn)
 
 	if (!lgr)
 		return;
+	smc_cdc_tx_dismiss_slots(conn);
 	smc_lgr_unregister_conn(conn);
 	smc_rmb_unuse(conn);
 	smc_sndbuf_unuse(conn);
@@ -435,6 +437,11 @@ create:
 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
 		rc = smc_link_determine_gid(conn->lgr);
 	}
+	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+	conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&conn->acurs_lock);
+#endif
 
 out:
 	return rc ? rc : local_contact;
@@ -535,6 +542,7 @@ int smc_sndbuf_create(struct smc_sock *smc)
 		conn->sndbuf_desc = sndbuf_desc;
 		conn->sndbuf_size = tmp_bufsize;
 		smc->sk.sk_sndbuf = tmp_bufsize * 2;
+		atomic_set(&conn->sndbuf_space, tmp_bufsize);
 		return 0;
 	} else {
 		return -ENOMEM;
@@ -611,6 +619,7 @@ int smc_rmb_create(struct smc_sock *smc)
 		conn->rmbe_size = tmp_bufsize;
 		conn->rmbe_size_short = tmp_bufsize_short;
 		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+		atomic_set(&conn->bytes_to_rcv, 0);
 		return 0;
 	} else {
 		return -ENOMEM;
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 5cbc1e5f7606..14f3f3fa6e52 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -237,6 +237,25 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
 	return rc;
 }
 
+void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
+			     smc_wr_tx_filter filter,
+			     smc_wr_tx_dismisser dismisser,
+			     unsigned long data)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_rx_hdr *wr_rx;
+	int i;
+
+	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+		if (wr_rx->type != wr_rx_hdr_type)
+			continue;
+		tx_pend = &link->wr_tx_pends[i].priv;
+		if (filter(tx_pend, data))
+			dismisser(tx_pend);
+	}
+}
+
 /****************************** receive queue ********************************/
 
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 0b626728ce44..124f857ddaa8 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -36,6 +36,11 @@ typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
 				  struct smc_link *,
 				  enum ib_wc_status);
 
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+				 unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
 struct smc_wr_rx_handler {
 	struct hlist_node	list;	/* hash table collision resolution */
 	void			(*handler)(struct ib_wc *, void *);
@@ -87,6 +92,10 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 int smc_wr_tx_send(struct smc_link *link,
 		   struct smc_wr_tx_pend_priv *wr_pend_priv);
 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
+			     smc_wr_tx_filter filter,
+			     smc_wr_tx_dismisser dismisser,
+			     unsigned long data);
 
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
 int smc_wr_rx_post_init(struct smc_link *link);
-- 
cgit v1.2.3


From e6727f39004bd95725342b3b343a14c7d59df07f Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:23 +0100
Subject: smc: send data (through RDMA)

copy data to kernel send buffer, and trigger RDMA write

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile  |   2 +-
 net/smc/af_smc.c  |  13 +-
 net/smc/smc.h     |   1 +
 net/smc/smc_cdc.c |   7 +-
 net/smc/smc_tx.c  | 438 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_tx.h  |  34 +++++
 6 files changed, 491 insertions(+), 4 deletions(-)
 create mode 100644 net/smc/smc_tx.c
 create mode 100644 net/smc/smc_tx.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index ec0fd0338c6d..fc28d79aec7d 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o
+smc-y += smc_cdc.o smc_tx.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 4b1611d46ef7..b62b69c6c718 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -37,6 +37,7 @@
 #include "smc_core.h"
 #include "smc_ib.h"
 #include "smc_pnet.h"
+#include "smc_tx.h"
 
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
@@ -410,6 +411,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	}
 
 	mutex_unlock(&smc_create_lgr_pending);
+	smc_tx_init(smc);
+
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
 	smc->sk.sk_state = SMC_ACTIVE;
@@ -751,6 +754,8 @@ static void smc_listen_work(struct work_struct *work)
 			goto decline_rdma;
 	}
 
+	smc_tx_init(new_smc);
+
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
 	newsmcsk->sk_state = SMC_ACTIVE;
@@ -924,7 +929,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
 	else
-		rc = sock_no_sendmsg(sock, msg, len);
+		rc = smc_tx_sendmsg(smc, msg, len);
 out:
 	release_sock(sk);
 	return rc;
@@ -1005,6 +1010,12 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 			mask |= smc_accept_poll(sk);
 		if (sk->sk_err)
 			mask |= POLLERR;
+		if (atomic_read(&smc->conn.sndbuf_space)) {
+			mask |= POLLOUT | POLLWRNORM;
+		} else {
+			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		}
 		/* for now - to be enhanced in follow-on patch */
 	}
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a8d4a68aac30..0c47d84c1db1 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -133,6 +133,7 @@ struct smc_connection {
 	atomic_t		sndbuf_space;	/* remaining space in sndbuf */
 	u16			tx_cdc_seq;	/* sequence # for CDC send */
 	spinlock_t		send_lock;	/* protect wr_sends */
+	struct work_struct	tx_work;	/* retry of smc_cdc_msg_send */
 
 	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
 						 * .prod cf. TCP rcv_nxt
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 111d23ff9e17..77fe16967376 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -14,6 +14,7 @@
 #include "smc.h"
 #include "smc_wr.h"
 #include "smc_cdc.h"
+#include "smc_tx.h"
 
 /********************************** send *************************************/
 
@@ -52,7 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 			       smc_curs_read(&cdcpend->cursor, cdcpend->conn),
 			       cdcpend->conn);
 	}
-	/* subsequent patch: wake if send buffer space available */
+	smc_tx_sndbuf_nonfull(smc);
 	bh_unlock_sock(&smc->sk);
 }
 
@@ -204,7 +205,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* subsequent patch: terminate connection */
 
 	/* piggy backed tx info */
-	/* subsequent patch: wake receivers if receive buffer space available */
+	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+	if (diff_cons && smc_tx_prepared_sends(conn))
+		smc_tx_sndbuf_nonempty(conn);
 
 	/* subsequent patch: trigger socket release if connection closed */
 
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..d86bef6cb681
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,438 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * called under sk_socket lock.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct smc_sock *smc = smc_sk(sk);
+	struct socket_wq *wq;
+
+	/* similar to sk_stream_write_space */
+	if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (skwq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait,
+						   POLLOUT | POLLWRNORM |
+						   POLLWRBAND);
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
+	}
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+	if (smc->sk.sk_socket &&
+	    test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+		smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available */
+static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool noblock;
+	long timeo;
+	int rc = 0;
+
+	/* similar to sk_stream_wait_memory */
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	noblock = timeo ? false : true;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (sk->sk_err ||
+		    (sk->sk_shutdown & SEND_SHUTDOWN) ||
+		    conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+			rc = -EPIPE;
+			break;
+		}
+		if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+			rc = -ECONNRESET;
+			break;
+		}
+		if (!timeo) {
+			if (noblock)
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			rc = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(timeo);
+			break;
+		}
+		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (atomic_read(&conn->sndbuf_space))
+			break; /* at least 1 byte of free space available */
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		sk_wait_event(sk, &timeo,
+			      sk->sk_err ||
+			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
+			      smc_cdc_rxed_any_close_or_senddone(conn) ||
+			      atomic_read(&conn->sndbuf_space),
+			      &wait);
+		sk->sk_write_pending--;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return rc;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+	size_t copylen, send_done = 0, send_remaining = len;
+	size_t chunk_len, chunk_off, chunk_len_sum;
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor prep;
+	struct sock *sk = &smc->sk;
+	char *sndbuf_base;
+	int tx_cnt_prep;
+	int writespace;
+	int rc, chunk;
+
+	/* This should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+		rc = -EPIPE;
+		goto out_err;
+	}
+
+	while (msg_data_left(msg)) {
+		if (sk->sk_state == SMC_INIT)
+			return -ENOTCONN;
+		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+			return -EPIPE;
+		if (smc_cdc_rxed_any_close(conn))
+			return send_done ?: -ECONNRESET;
+
+		if (!atomic_read(&conn->sndbuf_space)) {
+			rc = smc_tx_wait_memory(smc, msg->msg_flags);
+			if (rc) {
+				if (send_done)
+					return send_done;
+				goto out_err;
+			}
+			continue;
+		}
+
+		/* initialize variables for 1st iteration of subsequent loop */
+		/* could be just 1 byte, even after smc_tx_wait_memory above */
+		writespace = atomic_read(&conn->sndbuf_space);
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, send_remaining, writespace);
+		/* determine start of sndbuf */
+		sndbuf_base = conn->sndbuf_desc->cpu_addr;
+		smc_curs_write(&prep,
+			       smc_curs_read(&conn->tx_curs_prep, conn),
+			       conn);
+		tx_cnt_prep = prep.count;
+		/* determine chunks where to write into sndbuf */
+		/* either unwrapped case, or 1st chunk of wrapped case */
+		chunk_len = min_t(size_t,
+				  copylen, conn->sndbuf_size - tx_cnt_prep);
+		chunk_len_sum = chunk_len;
+		chunk_off = tx_cnt_prep;
+		for (chunk = 0; chunk < 2; chunk++) {
+			rc = memcpy_from_msg(sndbuf_base + chunk_off,
+					     msg, chunk_len);
+			if (rc) {
+				if (send_done)
+					return send_done;
+				goto out_err;
+			}
+			send_done += chunk_len;
+			send_remaining -= chunk_len;
+
+			if (chunk_len_sum == copylen)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			chunk_len = copylen - chunk_len; /* remainder */
+			chunk_len_sum += chunk_len;
+			chunk_off = 0; /* modulo offset in send ring buffer */
+		}
+		/* update cursors */
+		smc_curs_add(conn->sndbuf_size, &prep, copylen);
+		smc_curs_write(&conn->tx_curs_prep,
+			       smc_curs_read(&prep, conn),
+			       conn);
+		/* increased in send tasklet smc_cdc_tx_handler() */
+		smp_mb__before_atomic();
+		atomic_sub(copylen, &conn->sndbuf_space);
+		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		smp_mb__after_atomic();
+		/* since we just produced more new data into sndbuf,
+		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+		 */
+		smc_tx_sndbuf_nonempty(conn);
+	} /* while (msg_data_left(msg)) */
+
+	return send_done;
+
+out_err:
+	rc = sk_stream_error(sk, msg->msg_flags, rc);
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(rc == -EAGAIN))
+		sk->sk_write_space(sk);
+	return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+			     int num_sges, struct ib_sge sges[])
+{
+	struct smc_link_group *lgr = conn->lgr;
+	struct ib_send_wr *failed_wr = NULL;
+	struct ib_rdma_wr rdma_wr;
+	struct smc_link *link;
+	int rc;
+
+	memset(&rdma_wr, 0, sizeof(rdma_wr));
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+	rdma_wr.wr.sg_list = sges;
+	rdma_wr.wr.num_sge = num_sges;
+	rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
+	rdma_wr.remote_addr =
+		lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+		/* RMBE within RMB */
+		((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+		/* offset within RMBE */
+		peer_rmbe_offset;
+	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+	rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+	if (rc)
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+					  union smc_host_cursor *prod,
+					  union smc_host_cursor *sent,
+					  size_t len)
+{
+	smc_curs_add(conn->peer_rmbe_size, prod, len);
+	/* increased in recv tasklet smc_cdc_msg_rcv() */
+	smp_mb__before_atomic();
+	/* data in flight reduces usable snd_wnd */
+	atomic_sub(len, &conn->peer_rmbe_space);
+	/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+	smp_mb__after_atomic();
+	smc_curs_add(conn->sndbuf_size, sent, len);
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn)
+{
+	size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
+	size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+	union smc_host_cursor sent, prep, prod, cons;
+	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+	struct smc_link_group *lgr = conn->lgr;
+	int to_send, rmbespace;
+	struct smc_link *link;
+	int num_sges;
+	int rc;
+
+	/* source: sndbuf */
+	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+	/* cf. wmem_alloc - (snd_max - snd_una) */
+	to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+	if (to_send <= 0)
+		return 0;
+
+	/* destination: RMBE */
+	/* cf. snd_wnd */
+	rmbespace = atomic_read(&conn->peer_rmbe_space);
+	if (rmbespace <= 0)
+		return 0;
+	smc_curs_write(&prod,
+		       smc_curs_read(&conn->local_tx_ctrl.prod, conn),
+		       conn);
+	smc_curs_write(&cons,
+		       smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+		       conn);
+
+	/* if usable snd_wnd closes ask peer to advertise once it opens again */
+	conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+	/* cf. usable snd_wnd */
+	len = min(to_send, rmbespace);
+
+	/* initialize variables for first iteration of subsequent nested loop */
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	dst_off = prod.count;
+	if (prod.wrap == cons.wrap) {
+		/* the filled destination area is unwrapped,
+		 * hence the available free destination space is wrapped
+		 * and we need 2 destination chunks of sum len; start with 1st
+		 * which is limited by what's available in sndbuf
+		 */
+		dst_len = min_t(size_t,
+				conn->peer_rmbe_size - prod.count, len);
+	} else {
+		/* the filled destination area is wrapped,
+		 * hence the available free destination space is unwrapped
+		 * and we need a single destination chunk of entire len
+		 */
+		dst_len = len;
+	}
+	dst_len_sum = dst_len;
+	src_off = sent.count;
+	/* dst_len determines the maximum src_len */
+	if (sent.count + dst_len <= conn->sndbuf_size) {
+		/* unwrapped src case: single chunk of entire dst_len */
+		src_len = dst_len;
+	} else {
+		/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+		src_len = conn->sndbuf_size - sent.count;
+	}
+	src_len_sum = src_len;
+	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+		num_sges = 0;
+		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+			sges[srcchunk].addr =
+				conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
+				src_off;
+			sges[srcchunk].length = src_len;
+			sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+			num_sges++;
+			src_off += src_len;
+			if (src_off >= conn->sndbuf_size)
+				src_off -= conn->sndbuf_size;
+						/* modulo in send ring */
+			if (src_len_sum == dst_len)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			src_len = dst_len - src_len; /* remainder */
+			src_len_sum += src_len;
+		}
+		rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+		if (rc)
+			return rc;
+		if (dst_len_sum == len)
+			break; /* either on 1st or 2nd iteration */
+		/* prepare next (== 2nd) iteration */
+		dst_off = 0; /* modulo offset in RMBE ring buffer */
+		dst_len = len - dst_len; /* remainder */
+		dst_len_sum += dst_len;
+		src_len = min_t(int,
+				dst_len, conn->sndbuf_size - sent.count);
+		src_len_sum = src_len;
+	}
+
+	smc_tx_advance_cursors(conn, &prod, &sent, len);
+	/* update connection's cursors with advanced local cursors */
+	smc_curs_write(&conn->local_tx_ctrl.prod,
+		       smc_curs_read(&prod, conn),
+		       conn);
+							/* dst: peer RMBE */
+	smc_curs_write(&conn->tx_curs_sent,
+		       smc_curs_read(&sent, conn),
+		       conn);
+							/* src: local sndbuf */
+
+	return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	spin_lock_bh(&conn->send_lock);
+	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+				   &pend);
+	if (rc < 0) {
+		if (rc == -EBUSY) {
+			rc = 0;
+			schedule_work(&conn->tx_work);
+		}
+		goto out_unlock;
+	}
+
+	rc = smc_tx_rdma_writes(conn);
+	if (rc) {
+		smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+				   (struct smc_wr_tx_pend_priv *)pend);
+		goto out_unlock;
+	}
+
+	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+
+out_unlock:
+	spin_unlock_bh(&conn->send_lock);
+	return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+static void smc_tx_work(struct work_struct *work)
+{
+	struct smc_connection *conn = container_of(work,
+						   struct smc_connection,
+						   tx_work);
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+	lock_sock(&smc->sk);
+	smc_tx_sndbuf_nonempty(conn);
+	release_sock(&smc->sk);
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+	smc->sk.sk_write_space = smc_tx_write_space;
+	INIT_WORK(&smc->conn.tx_work, smc_tx_work);
+	spin_lock_init(&smc->conn.send_lock);
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..931c66645624
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,34 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+	union smc_host_cursor sent, prep;
+
+	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+	return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+}
+
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+
+#endif /* SMC_TX_H */
-- 
cgit v1.2.3


From 952310ccf2d861966cfb8706f16d5e4eb585edb7 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:24 +0100
Subject: smc: receive data from RMBE

move RMBE data into user space buffer and update managing cursors

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile   |   2 +-
 net/smc/af_smc.c   |   7 +-
 net/smc/smc.h      |   4 +
 net/smc/smc_cdc.c  |   6 +-
 net/smc/smc_core.c |  10 +++
 net/smc/smc_rx.c   | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_rx.h   |  23 ++++++
 net/smc/smc_tx.c   |  37 +++++++++
 net/smc/smc_tx.h   |   1 +
 9 files changed, 304 insertions(+), 3 deletions(-)
 create mode 100644 net/smc/smc_rx.c
 create mode 100644 net/smc/smc_rx.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index fc28d79aec7d..6255e29090b5 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b62b69c6c718..fc9c51c549e5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -38,6 +38,7 @@
 #include "smc_ib.h"
 #include "smc_pnet.h"
 #include "smc_tx.h"
+#include "smc_rx.h"
 
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
@@ -412,6 +413,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_tx_init(smc);
+	smc_rx_init(smc);
 
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
@@ -755,6 +757,7 @@ static void smc_listen_work(struct work_struct *work)
 	}
 
 	smc_tx_init(new_smc);
+	smc_rx_init(new_smc);
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
@@ -950,7 +953,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
 	else
-		rc = sock_no_recvmsg(sock, msg, len, flags);
+		rc = smc_rx_recvmsg(smc, msg, len, flags);
 out:
 	release_sock(sk);
 	return rc;
@@ -1016,6 +1019,8 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		}
+		if (atomic_read(&smc->conn.bytes_to_rcv))
+			mask |= POLLIN | POLLRDNORM;
 		/* for now - to be enhanced in follow-on patch */
 	}
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 0c47d84c1db1..2bb1540b5103 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -115,6 +115,10 @@ struct smc_connection {
 	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
 	int			rmbe_size;	/* RMBE size <== sock rmem */
 	int			rmbe_size_short;/* compressed notation */
+	int			rmbe_update_limit;
+						/* lower limit for consumer
+						 * cursor update
+						 */
 
 	struct smc_host_cdc_msg	local_tx_ctrl;	/* host byte order staging
 						 * buffer for CDC msg send
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 77fe16967376..c0a69300b2f4 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -15,6 +15,7 @@
 #include "smc_wr.h"
 #include "smc_cdc.h"
 #include "smc_tx.h"
+#include "smc_rx.h"
 
 /********************************** send *************************************/
 
@@ -197,6 +198,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
 		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
 		smp_mb__after_atomic();
+		smc->sk.sk_data_ready(&smc->sk);
 	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
@@ -216,7 +218,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		return;
 
 	/* data available */
-	/* subsequent patch: send delayed ack, wake receivers */
+	if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+	    (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+		smc_tx_consumer_update(conn);
 }
 
 /* called under tasklet context */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 537e387b9e85..e5c63950fc28 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -489,6 +489,15 @@ struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
 	return NULL;
 }
 
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
 /* create the tx buffer for an SMC socket */
 int smc_sndbuf_create(struct smc_sock *smc)
 {
@@ -620,6 +629,7 @@ int smc_rmb_create(struct smc_sock *smc)
 		conn->rmbe_size_short = tmp_bufsize_short;
 		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
+		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
 		return 0;
 	} else {
 		return -ENOMEM;
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..5d1878732f46
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,217 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	/* derived from sock_def_readable() */
+	/* called already in smc_listen_work() */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+						POLLRDNORM | POLLRDBAND);
+	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+	    (sk->sk_state == SMC_CLOSED))
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+	else
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ *   @smc    smc socket
+ *   @timeo  pointer to max seconds to wait, pointer to value 0 for no timeout
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int rc;
+
+	if (atomic_read(&conn->bytes_to_rcv))
+		return 1;
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	add_wait_queue(sk_sleep(sk), &wait);
+	rc = sk_wait_event(sk, timeo,
+			   sk->sk_err ||
+			   sk->sk_shutdown & RCV_SHUTDOWN ||
+			   sock_flag(sk, SOCK_DONE) ||
+			   atomic_read(&conn->bytes_to_rcv) ||
+			   smc_cdc_rxed_any_close_or_senddone(conn),
+			   &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	return rc;
+}
+
+/* rcvbuf consumer: main API called by socket layer.
+ * called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags)
+{
+	size_t copylen, read_done = 0, read_remaining = len;
+	size_t chunk_len, chunk_off, chunk_len_sum;
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	int readable, chunk;
+	char *rcvbuf_base;
+	struct sock *sk;
+	long timeo;
+	int target;		/* Read at least these many bytes */
+	int rc;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+	if (flags & MSG_OOB)
+		return -EINVAL; /* future work */
+
+	sk = &smc->sk;
+	if (sk->sk_state == SMC_LISTEN)
+		return -ENOTCONN;
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	msg->msg_namelen = 0;
+	/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+	rcvbuf_base = conn->rmb_desc->cpu_addr;
+
+	do { /* while (read_remaining) */
+		if (read_done >= target)
+			break;
+
+		if (atomic_read(&conn->bytes_to_rcv))
+			goto copy;
+
+		if (read_done) {
+			if (sk->sk_err ||
+			    sk->sk_state == SMC_CLOSED ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current) ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				read_done = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+			if (sk->sk_state == SMC_CLOSED) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					read_done = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+			if (signal_pending(current)) {
+				read_done = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (!atomic_read(&conn->bytes_to_rcv)) {
+			smc_rx_wait_data(smc, &timeo);
+			continue;
+		}
+
+copy:
+		/* initialize variables for 1st iteration of subsequent loop */
+		/* could be just 1 byte, even after smc_rx_wait_data above */
+		readable = atomic_read(&conn->bytes_to_rcv);
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
+		smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		/* determine chunks where to read from rcvbuf */
+		/* either unwrapped case, or 1st chunk of wrapped case */
+		chunk_len = min_t(size_t,
+				  copylen, conn->rmbe_size - cons.count);
+		chunk_len_sum = chunk_len;
+		chunk_off = cons.count;
+		for (chunk = 0; chunk < 2; chunk++) {
+			if (!(flags & MSG_TRUNC)) {
+				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
+						   chunk_len);
+				if (rc) {
+					if (!read_done)
+						read_done = -EFAULT;
+					goto out;
+				}
+			}
+			read_remaining -= chunk_len;
+			read_done += chunk_len;
+
+			if (chunk_len_sum == copylen)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			chunk_len = copylen - chunk_len; /* remainder */
+			chunk_len_sum += chunk_len;
+			chunk_off = 0; /* modulo offset in recv ring buffer */
+		}
+
+		/* update cursors */
+		if (!(flags & MSG_PEEK)) {
+			smc_curs_add(conn->rmbe_size, &cons, copylen);
+			/* increased in recv tasklet smc_cdc_msg_rcv() */
+			smp_mb__before_atomic();
+			atomic_sub(copylen, &conn->bytes_to_rcv);
+			/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+			smp_mb__after_atomic();
+			smc_curs_write(&conn->local_tx_ctrl.cons,
+				       smc_curs_read(&cons, conn),
+				       conn);
+			/* send consumer cursor update if required */
+			/* similar to advertising new TCP rcv_wnd if required */
+			smc_tx_consumer_update(conn);
+		}
+	} while (read_remaining);
+out:
+	return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+	smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index d86bef6cb681..7e8799fcd3a0 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -427,6 +427,43 @@ static void smc_tx_work(struct work_struct *work)
 	release_sock(&smc->sk);
 }
 
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+	union smc_host_cursor cfed, cons;
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int to_confirm, rc;
+
+	smc_curs_write(&cons,
+		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_curs_write(&cfed,
+		       smc_curs_read(&conn->rx_curs_confirmed, conn),
+		       conn);
+	to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
+	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    ((to_confirm > conn->rmbe_update_limit) &&
+	     ((to_confirm > (conn->rmbe_size / 2)) ||
+	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
+		rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   &wr_buf, &pend);
+		if (!rc)
+			rc = smc_cdc_msg_send(conn, wr_buf, pend);
+		if (rc < 0) {
+			schedule_work(&conn->tx_work);
+			return;
+		}
+		smc_curs_write(&conn->rx_curs_confirmed,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+	}
+	if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+	    !atomic_read(&conn->bytes_to_rcv))
+		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
 /***************************** send initialize *******************************/
 
 /* Initialize send properties on connection establishment. NB: not __init! */
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 931c66645624..1d6a0dcdcfe6 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -30,5 +30,6 @@ void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn);
 
 #endif /* SMC_TX_H */
-- 
cgit v1.2.3


From b38d732477e4211351b2680e805d944f66bceec9 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:25 +0100
Subject: smc: socket closing and linkgroup cleanup

smc_shutdown() and smc_release() handling
delayed linkgroup cleanup for linkgroups without connections

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/Makefile    |   2 +-
 net/smc/af_smc.c    | 127 ++++++++++++---
 net/smc/smc.h       |  18 +++
 net/smc/smc_cdc.c   |  23 ++-
 net/smc/smc_cdc.h   |   1 +
 net/smc/smc_close.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_close.h |  28 ++++
 net/smc/smc_core.c  |   7 +-
 net/smc/smc_tx.c    |   8 +
 net/smc/smc_wr.c    |  47 ++++--
 net/smc/smc_wr.h    |   2 +
 11 files changed, 668 insertions(+), 36 deletions(-)
 create mode 100644 net/smc/smc_close.c
 create mode 100644 net/smc/smc_close.h

(limited to 'net')

diff --git a/net/smc/Makefile b/net/smc/Makefile
index 6255e29090b5..5cf0cafaa208 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index fc9c51c549e5..3f543d58bc5c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -39,6 +39,7 @@
 #include "smc_pnet.h"
 #include "smc_tx.h"
 #include "smc_rx.h"
+#include "smc_close.h"
 
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
@@ -70,14 +71,29 @@ static int smc_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
+	int rc = 0;
 
 	if (!sk)
 		goto out;
 
 	smc = smc_sk(sk);
-	lock_sock(sk);
+	sock_hold(sk);
+	if (sk->sk_state == SMC_LISTEN)
+		/* smc_close_non_accepted() is called and acquires
+		 * sock lock for child sockets again
+		 */
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+	else
+		lock_sock(sk);
 
-	sk->sk_state = SMC_CLOSED;
+	if (smc->use_fallback) {
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk);
+	} else {
+		rc = smc_close_active(smc);
+		sock_set_flag(sk, SOCK_DEAD);
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+	}
 	if (smc->clcsock) {
 		sock_release(smc->clcsock);
 		smc->clcsock = NULL;
@@ -86,11 +102,18 @@ static int smc_release(struct socket *sock)
 	/* detach socket */
 	sock_orphan(sk);
 	sock->sk = NULL;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else if (sk->sk_state == SMC_CLOSED) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
 	release_sock(sk);
 
 	sock_put(sk);
 out:
-	return 0;
+	return rc;
 }
 
 static void smc_destruct(struct sock *sk)
@@ -120,6 +143,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
+	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 	sk_refcnt_debug_inc(sk);
 
 	return sk;
@@ -417,7 +441,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
-	smc->sk.sk_state = SMC_ACTIVE;
+	if (smc->sk.sk_state == SMC_INIT)
+		smc->sk.sk_state = SMC_ACTIVE;
 
 	return rc ? rc : local_contact;
 
@@ -559,8 +584,8 @@ static void smc_accept_unlink(struct sock *sk)
 /* remove a sock from the accept queue to bind it to a new socket created
  * for a socket accept call from user space
  */
-static struct sock *smc_accept_dequeue(struct sock *parent,
-				       struct socket *new_sock)
+struct sock *smc_accept_dequeue(struct sock *parent,
+				struct socket *new_sock)
 {
 	struct smc_sock *isk, *n;
 	struct sock *new_sk;
@@ -581,11 +606,17 @@ static struct sock *smc_accept_dequeue(struct sock *parent,
 }
 
 /* clean up for a created but never accepted sock */
-static void smc_close_non_accepted(struct sock *sk)
+void smc_close_non_accepted(struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
 	sock_hold(sk);
+	lock_sock(sk);
+	if (!sk->sk_lingertime)
+		/* wait for peer closing */
+		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	if (!smc->use_fallback)
+		smc_close_active(smc);
 	if (smc->clcsock) {
 		struct socket *tcp;
 
@@ -593,7 +624,16 @@ static void smc_close_non_accepted(struct sock *sk)
 		smc->clcsock = NULL;
 		sock_release(tcp);
 	}
-	/* more closing stuff to be added with socket closing patch */
+	sock_set_flag(sk, SOCK_DEAD);
+	sk->sk_shutdown |= SHUTDOWN_MASK;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+	release_sock(sk);
 	sock_put(sk);
 }
 
@@ -761,11 +801,12 @@ static void smc_listen_work(struct work_struct *work)
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
-	newsmcsk->sk_state = SMC_ACTIVE;
+	if (newsmcsk->sk_state == SMC_INIT)
+		newsmcsk->sk_state = SMC_ACTIVE;
 enqueue:
 	if (local_contact == SMC_FIRST_CONTACT)
 		mutex_unlock(&smc_create_lgr_pending);
-	lock_sock(&lsmc->sk);
+	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 	if (lsmc->sk.sk_state == SMC_LISTEN) {
 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
 	} else { /* no longer listening */
@@ -791,6 +832,7 @@ decline_rdma:
 
 out_err:
 	newsmcsk->sk_state = SMC_CLOSED;
+	smc_conn_free(&new_smc->conn);
 	goto enqueue; /* queue new sock with sk_err set */
 }
 
@@ -911,7 +953,8 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr,
 {
 	struct smc_sock *smc;
 
-	if (peer && (sock->sk->sk_state != SMC_ACTIVE))
+	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
 		return -ENOTCONN;
 
 	smc = smc_sk(sock->sk);
@@ -927,7 +970,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if (sk->sk_state != SMC_ACTIVE)
+	if ((sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_INIT))
 		goto out;
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
@@ -947,13 +992,21 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+	if ((sk->sk_state == SMC_INIT) ||
+	    (sk->sk_state == SMC_LISTEN) ||
+	    (sk->sk_state == SMC_CLOSED))
+		goto out;
+
+	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+		rc = 0;
 		goto out;
+	}
 
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
 	else
 		rc = smc_rx_recvmsg(smc, msg, len, flags);
+
 out:
 	release_sock(sk);
 	return rc;
@@ -1013,7 +1066,8 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 			mask |= smc_accept_poll(sk);
 		if (sk->sk_err)
 			mask |= POLLERR;
-		if (atomic_read(&smc->conn.sndbuf_space)) {
+		if (atomic_read(&smc->conn.sndbuf_space) ||
+		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
 			mask |= POLLOUT | POLLWRNORM;
 		} else {
 			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -1021,7 +1075,14 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 		}
 		if (atomic_read(&smc->conn.bytes_to_rcv))
 			mask |= POLLIN | POLLRDNORM;
-		/* for now - to be enhanced in follow-on patch */
+		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+		    (sk->sk_state == SMC_CLOSED))
+			mask |= POLLHUP;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+		if (sk->sk_state == SMC_APPCLOSEWAIT1)
+			mask |= POLLIN;
+
 	}
 
 	return mask;
@@ -1032,31 +1093,53 @@ static int smc_shutdown(struct socket *sock, int how)
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
 	int rc = -EINVAL;
+	int rc1 = 0;
 
 	smc = smc_sk(sk);
 
 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
-		goto out_err;
+		return rc;
 
 	lock_sock(sk);
 
 	rc = -ENOTCONN;
-	if (sk->sk_state == SMC_CLOSED)
+	if ((sk->sk_state != SMC_LISTEN) &&
+	    (sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
 		goto out;
 	if (smc->use_fallback) {
 		rc = kernel_sock_shutdown(smc->clcsock, how);
 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
 		if (sk->sk_shutdown == SHUTDOWN_MASK)
 			sk->sk_state = SMC_CLOSED;
-	} else {
-		rc = sock_no_shutdown(sock, how);
+		goto out;
+	}
+	switch (how) {
+	case SHUT_RDWR:		/* shutdown in both directions */
+		rc = smc_close_active(smc);
+		break;
+	case SHUT_WR:
+		rc = smc_close_shutdown_write(smc);
+		break;
+	case SHUT_RD:
+		if (sk->sk_state == SMC_LISTEN)
+			rc = smc_close_active(smc);
+		else
+			rc = 0;
+			/* nothing more to do because peer is not involved */
+		break;
 	}
+	rc1 = kernel_sock_shutdown(smc->clcsock, how);
+	/* map sock_shutdown_cmd constants to sk_shutdown value range */
+	sk->sk_shutdown |= how + 1;
 
 out:
 	release_sock(sk);
-
-out_err:
-	return rc;
+	return rc ? rc : rc1;
 }
 
 static int smc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 2bb1540b5103..959a5d2014ab 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -30,6 +30,16 @@ enum smc_state {		/* possible states of an SMC socket */
 	SMC_INIT	= 2,
 	SMC_CLOSED	= 7,
 	SMC_LISTEN	= 10,
+	/* normal close */
+	SMC_PEERCLOSEWAIT1	= 20,
+	SMC_PEERCLOSEWAIT2	= 21,
+	SMC_APPFINCLOSEWAIT	= 24,
+	SMC_APPCLOSEWAIT1	= 22,
+	SMC_APPCLOSEWAIT2	= 23,
+	SMC_PEERFINCLOSEWAIT	= 25,
+	/* abnormal close */
+	SMC_PEERABORTWAIT	= 26,
+	SMC_PROCESSABORT	= 27,
 };
 
 struct smc_link_group;
@@ -164,7 +174,13 @@ struct smc_sock {				/* smc sock container */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
+	struct delayed_work	sock_put_work;	/* final socket freeing */
 	bool			use_fallback;	/* fallback to tcp */
+	u8			wait_close_tx_prepared : 1;
+						/* shutdown wr or close
+						 * started, waiting for unsent
+						 * data to be sent
+						 */
 };
 
 static inline struct smc_sock *smc_sk(const struct sock *sk)
@@ -250,5 +266,7 @@ void smc_conn_free(struct smc_connection *conn);
 int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
 		    struct smc_ib_device *smcibdev, u8 ibport,
 		    struct smc_clc_msg_local *lcl, int srv_first_contact);
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
 
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index c0a69300b2f4..5a339493872e 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -16,6 +16,7 @@
 #include "smc_cdc.h"
 #include "smc_tx.h"
 #include "smc_rx.h"
+#include "smc_close.h"
 
 /********************************** send *************************************/
 
@@ -55,6 +56,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 			       cdcpend->conn);
 	}
 	smc_tx_sndbuf_nonfull(smc);
+	if (smc->sk.sk_state != SMC_ACTIVE)
+		/* wake up smc_close_wait_tx_pends() */
+		smc->sk.sk_state_change(&smc->sk);
 	bh_unlock_sock(&smc->sk);
 }
 
@@ -149,6 +153,14 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
 				(unsigned long)conn);
 }
 
+bool smc_cdc_tx_has_pending(struct smc_connection *conn)
+{
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
+				     smc_cdc_tx_filter, (unsigned long)conn);
+}
+
 /********************************* receive ***********************************/
 
 static inline bool smc_cdc_before(u16 seq1, u16 seq2)
@@ -201,15 +213,20 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		smc->sk.sk_data_ready(&smc->sk);
 	}
 
-	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	}
 	if (smc_cdc_rxed_any_close_or_senddone(conn))
-		/* subsequent patch: terminate connection */
+		smc_close_passive_received(smc);
 
 	/* piggy backed tx info */
 	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
-	if (diff_cons && smc_tx_prepared_sends(conn))
+	if (diff_cons && smc_tx_prepared_sends(conn)) {
 		smc_tx_sndbuf_nonempty(conn);
+		/* trigger socket release if connection closed */
+		smc_close_wake_tx_prepared(smc);
+	}
 
 	/* subsequent patch: trigger socket release if connection closed */
 
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 135f61369a60..8e1d76f26007 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -212,6 +212,7 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
 		     struct smc_cdc_tx_pend *pend);
 int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+bool smc_cdc_tx_has_pending(struct smc_connection *conn);
 int smc_cdc_init(void) __init;
 
 #endif /* SMC_CDC_H */
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..d70c05b57021
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,441 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Socket Closing - normal and abnormal
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME		(5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	/* Close non-accepted connections */
+	while ((sk = smc_accept_dequeue(parent, NULL)))
+		smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+	signed long timeout;
+
+	timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_cdc_tx_has_pending(&smc->conn),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+
+	if (!timeout)
+		return;
+
+	if (!smc_tx_prepared_sends(&smc->conn))
+		return;
+
+	smc->wait_close_tx_prepared = 1;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_tx_prepared_sends(&smc->conn) ||
+				   (sk->sk_err == ECONNABORTED) ||
+				   (sk->sk_err == ECONNRESET),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+	if (smc->wait_close_tx_prepared)
+		/* wake up socket closing */
+		smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+	if (atomic_read(&conn->bytes_to_rcv))
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	else
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+
+	bh_lock_sock(&smc->sk);
+	smc->sk.sk_err = ECONNABORTED;
+	if (smc->clcsock && smc->clcsock->sk) {
+		smc->clcsock->sk->sk_err = ECONNABORTED;
+		smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+	}
+	switch (smc->sk.sk_state) {
+	case SMC_INIT:
+		smc->sk.sk_state = SMC_PEERABORTWAIT;
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		txflags->peer_conn_abort = 1;
+		sock_release(smc->clcsock);
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+		else
+			smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (!txflags->peer_conn_closed) {
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		} else {
+			smc->sk.sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PROCESSABORT:
+	case SMC_APPFINCLOSEWAIT:
+		if (!txflags->peer_conn_closed) {
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		}
+		smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		break;
+	}
+
+	sock_set_flag(&smc->sk, SOCK_DEAD);
+	bh_unlock_sock(&smc->sk);
+	smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER) &&
+	    !(current->flags & PF_EXITING))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_INIT:
+		sk->sk_state = SMC_CLOSED;
+		if (smc->smc_listen_work.func)
+			flush_work(&smc->smc_listen_work);
+		sock_put(sk);
+		break;
+	case SMC_LISTEN:
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk); /* wake up accept */
+		if (smc->clcsock && smc->clcsock->sk) {
+			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+			/* wake up kernel_accept of smc_tcp_listen_worker */
+			smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+		}
+		release_sock(sk);
+		smc_close_cleanup_listen(sk);
+		flush_work(&smc->tcp_listen_work);
+		lock_sock(sk);
+		break;
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_state == SMC_ACTIVE) {
+			/* send close request */
+			rc = smc_close_final(conn);
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		} else {
+			/* peer event has changed the state */
+			goto again;
+		}
+		break;
+	case SMC_APPFINCLOSEWAIT:
+		/* socket already shutdown wr or both (active close) */
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown wr done, send close request */
+			rc = smc_close_final(conn);
+		}
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_err != ECONNABORTED) {
+			/* confirm close from peer */
+			rc = smc_close_final(conn);
+			if (rc)
+				break;
+		}
+		if (smc_cdc_rxed_any_close(conn))
+			/* peer has closed the socket already */
+			sk->sk_state = SMC_CLOSED;
+		else
+			/* peer has just issued a shutdown write */
+			sk->sk_state = SMC_PEERFINCLOSEWAIT;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		/* peer sending PeerConnectionClosed will cause transition */
+		break;
+	case SMC_PROCESSABORT:
+		cancel_work_sync(&conn->tx_work);
+		smc_close_abort(conn);
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+
+	switch (sk->sk_state) {
+	case SMC_ACTIVE:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		smc_close_abort(&smc->conn);
+		sk->sk_state = SMC_PROCESSABORT;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown, but not yet closed locally */
+			smc_close_abort(&smc->conn);
+			sk->sk_state = SMC_PROCESSABORT;
+		} else {
+			sk->sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+		sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_INIT:
+	case SMC_PROCESSABORT:
+	/* nothing to do, add tracing in future patch */
+		break;
+	}
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *rxflags =
+		&smc->conn.local_rx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+	int old_state;
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	if (smc->clcsock && smc->clcsock->sk)
+		smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(&smc->sk, SOCK_DONE);
+
+	old_state = sk->sk_state;
+
+	if (rxflags->peer_conn_abort) {
+		smc_close_passive_abort_received(smc);
+		goto wakeup;
+	}
+
+	switch (sk->sk_state) {
+	case SMC_INIT:
+		if (atomic_read(&smc->conn.bytes_to_rcv) ||
+		    (rxflags->peer_done_writing &&
+		     !rxflags->peer_conn_closed))
+			sk->sk_state = SMC_APPCLOSEWAIT1;
+		else
+			sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_ACTIVE:
+		sk->sk_state = SMC_APPCLOSEWAIT1;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+		if (rxflags->peer_done_writing)
+			sk->sk_state = SMC_PEERCLOSEWAIT2;
+		/* fall through to check for closing */
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			break;
+		if (sock_flag(sk, SOCK_DEAD) &&
+		    (sk->sk_shutdown == SHUTDOWN_MASK)) {
+			/* smc_release has already been called locally */
+			sk->sk_state = SMC_CLOSED;
+		} else {
+			/* just shutdown, but not yet closed locally */
+			sk->sk_state = SMC_APPFINCLOSEWAIT;
+		}
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+wakeup:
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(sk);
+	sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+	sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+	if ((sk->sk_state == SMC_CLOSED) &&
+	    (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+	struct smc_sock *smc = container_of(to_delayed_work(work),
+					    struct smc_sock,
+					    sock_put_work);
+
+	sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* send close wr request */
+		rc = smc_close_wr(conn);
+		if (sk->sk_state == SMC_ACTIVE)
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		else
+			goto again;
+		break;
+	case SMC_APPCLOSEWAIT1:
+		/* passive close */
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* confirm close from peer */
+		rc = smc_close_wr(conn);
+		sk->sk_state = SMC_APPCLOSEWAIT2;
+		break;
+	case SMC_APPCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_PEERABORTWAIT:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT		(2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY		HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+void smc_close_active_abort(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+void smc_close_passive_received(struct smc_sock *smc);
+void smc_close_sock_put_work(struct work_struct *work);
+int smc_close_shutdown_write(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e5c63950fc28..8b1d34378829 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -23,6 +23,7 @@
 #include "smc_wr.h"
 #include "smc_llc.h"
 #include "smc_cdc.h"
+#include "smc_close.h"
 
 #define SMC_LGR_NUM_INCR	256
 #define SMC_LGR_FREE_DELAY	(600 * HZ)
@@ -295,6 +296,7 @@ void smc_lgr_free(struct smc_link_group *lgr)
 void smc_lgr_terminate(struct smc_link_group *lgr)
 {
 	struct smc_connection *conn;
+	struct smc_sock *smc;
 	struct rb_node *node;
 
 	spin_lock_bh(&smc_lgr_list.lock);
@@ -311,11 +313,14 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	node = rb_first(&lgr->conns_all);
 	while (node) {
 		conn = rb_entry(node, struct smc_connection, alert_node);
+		smc = container_of(conn, struct smc_sock, conn);
+		sock_hold(&smc->sk);
 		__smc_lgr_unregister_conn(conn);
+		smc_close_active_abort(smc);
+		sock_put(&smc->sk);
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
-	schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
 }
 
 /* Determine vlan of internal TCP socket.
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 7e8799fcd3a0..6e73b28915ea 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -139,6 +139,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		if (sk->sk_state == SMC_INIT)
 			return -ENOTCONN;
 		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+		    (smc->sk.sk_err == ECONNABORTED) ||
 		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
 			return -EPIPE;
 		if (smc_cdc_rxed_any_close(conn))
@@ -392,6 +393,13 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 				   &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
+			struct smc_sock *smc =
+				container_of(conn, struct smc_sock, conn);
+
+			if (smc->sk.sk_err == ECONNABORTED) {
+				rc = sock_error(&smc->sk);
+				goto out_unlock;
+			}
 			rc = 0;
 			schedule_work(&conn->tx_work);
 		}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 14f3f3fa6e52..eadf157418dc 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -81,6 +81,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 	if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
 		return;
 	if (wc->status) {
+		struct smc_link_group *lgr;
+
 		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
 			/* clear full struct smc_wr_tx_pend including .priv */
 			memset(&link->wr_tx_pends[i], 0,
@@ -89,9 +91,10 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 			       sizeof(link->wr_tx_bufs[i]));
 			clear_bit(i, link->wr_tx_mask);
 		}
-		/* tbd in future patch: terminate connections of this link
-		 * group abnormally
-		 */
+		/* terminate connections of this link group abnormally */
+		lgr = container_of(link, struct smc_link_group,
+				   lnk[SMC_SINGLE_LINK]);
+		smc_lgr_terminate(lgr);
 	}
 	if (pnd_snd.handler)
 		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -176,9 +179,12 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
 			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
 		if (!rc) {
-			/* tbd in future patch: timeout - terminate connections
-			 * of this link group abnormally
-			 */
+			/* timeout - terminate connections */
+			struct smc_link_group *lgr;
+
+			lgr = container_of(link, struct smc_link_group,
+					   lnk[SMC_SINGLE_LINK]);
+			smc_lgr_terminate(lgr);
 			return -EPIPE;
 		}
 		if (rc == -ERESTARTSYS)
@@ -256,6 +262,24 @@ void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
 	}
 }
 
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_rx_hdr *wr_rx;
+	int i;
+
+	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+		if (wr_rx->type != wr_rx_hdr_type)
+			continue;
+		tx_pend = &link->wr_tx_pends[i].priv;
+		if (filter(tx_pend, data))
+			return true;
+	}
+	return false;
+}
+
 /****************************** receive queue ********************************/
 
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -310,14 +334,19 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			smc_wr_rx_demultiplex(&wc[i]);
 			smc_wr_rx_post(link); /* refill WR RX */
 		} else {
+			struct smc_link_group *lgr;
+
 			/* handle status errors */
 			switch (wc[i].status) {
 			case IB_WC_RETRY_EXC_ERR:
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
-			/* tbd in future patch: terminate connections of this
-			 * link group abnormally
-			 */
+				/* terminate connections of this link group
+				 * abnormally
+				 */
+				lgr = container_of(link, struct smc_link_group,
+						   lnk[SMC_SINGLE_LINK]);
+				smc_lgr_terminate(lgr);
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 124f857ddaa8..0b9beeda6053 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -92,6 +92,8 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 int smc_wr_tx_send(struct smc_link *link,
 		   struct smc_wr_tx_pend_priv *wr_pend_priv);
 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data);
 void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
 			     smc_wr_tx_filter filter,
 			     smc_wr_tx_dismisser dismisser,
-- 
cgit v1.2.3


From f16a7dd5cf27eeda187425c9c7d96802a549f9c4 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Mon, 9 Jan 2017 16:55:26 +0100
Subject: smc: netlink interface for SMC sockets

Support for SMC socket monitoring via netlink sockets of protocol
NETLINK_SOCK_DIAG.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/smc.h             |  20 ++++
 include/net/sock.h            |   3 +
 include/uapi/linux/netlink.h  |   1 +
 include/uapi/linux/smc_diag.h |  85 +++++++++++++++++
 net/smc/Kconfig               |   9 ++
 net/smc/Makefile              |   1 +
 net/smc/af_smc.c              |  43 ++++++++-
 net/smc/smc.h                 |   2 +
 net/smc/smc_close.c           |   1 +
 net/smc/smc_diag.c            | 215 ++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 379 insertions(+), 1 deletion(-)
 create mode 100644 include/net/smc.h
 create mode 100644 include/uapi/linux/smc_diag.h
 create mode 100644 net/smc/smc_diag.c

(limited to 'net')

diff --git a/include/net/smc.h b/include/net/smc.h
new file mode 100644
index 000000000000..12d26358ad9f
--- /dev/null
+++ b/include/net/smc.h
@@ -0,0 +1,20 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for the SMC module (socket related)
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef _SMC_H
+#define _SMC_H
+
+struct smc_hashinfo {
+	rwlock_t lock;
+	struct hlist_head ht;
+};
+
+int smc_hash_sk(struct sock *sk);
+void smc_unhash_sk(struct sock *sk);
+#endif	/* _SMC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 99deda67eba0..389a0a619b45 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -70,6 +70,7 @@
 #include <net/checksum.h>
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
+#include <net/smc.h>
 
 /*
  * This structure really needs to be cleaned up.
@@ -986,6 +987,7 @@ struct request_sock_ops;
 struct timewait_sock_ops;
 struct inet_hashinfo;
 struct raw_hashinfo;
+struct smc_hashinfo;
 struct module;
 
 /*
@@ -1094,6 +1096,7 @@ struct proto {
 		struct inet_hashinfo	*hashinfo;
 		struct udp_table	*udp_table;
 		struct raw_hashinfo	*raw_hash;
+		struct smc_hashinfo	*smc_hash;
 	} h;
 
 	struct module		*owner;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 0dba4e4ed2be..f3946a27bd07 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -27,6 +27,7 @@
 #define NETLINK_ECRYPTFS	19
 #define NETLINK_RDMA		20
 #define NETLINK_CRYPTO		21	/* Crypto layer */
+#define NETLINK_SMC		22	/* SMC monitoring */
 
 #define NETLINK_INET_DIAG	NETLINK_SOCK_DIAG
 
diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
new file mode 100644
index 000000000000..0063919fea34
--- /dev/null
+++ b/include/uapi/linux/smc_diag.h
@@ -0,0 +1,85 @@
+#ifndef _UAPI_SMC_DIAG_H_
+#define _UAPI_SMC_DIAG_H_
+
+#include <linux/types.h>
+#include <linux/inet_diag.h>
+#include <rdma/ib_verbs.h>
+
+/* Request structure */
+struct smc_diag_req {
+	__u8	diag_family;
+	__u8	pad[2];
+	__u8	diag_ext;		/* Query extended information */
+	struct inet_diag_sockid	id;
+};
+
+/* Base info structure. It contains socket identity (addrs/ports/cookie) based
+ * on the internal clcsock, and more SMC-related socket data
+ */
+struct smc_diag_msg {
+	__u8	diag_family;
+	__u8	diag_state;
+	__u8	diag_fallback;
+	__u8	diag_shutdown;
+	struct inet_diag_sockid id;
+
+	__u32	diag_uid;
+	__u64	diag_inode;
+};
+
+/* Extensions */
+
+enum {
+	SMC_DIAG_NONE,
+	SMC_DIAG_CONNINFO,
+	SMC_DIAG_LGRINFO,
+	SMC_DIAG_SHUTDOWN,
+	__SMC_DIAG_MAX,
+};
+
+#define SMC_DIAG_MAX (__SMC_DIAG_MAX - 1)
+
+/* SMC_DIAG_CONNINFO */
+
+struct smc_diag_cursor {
+	__u16	reserved;
+	__u16	wrap;
+	__u32	count;
+};
+
+struct smc_diag_conninfo {
+	__u32			token;		/* unique connection id */
+	__u32			sndbuf_size;	/* size of send buffer */
+	__u32			rmbe_size;	/* size of RMB element */
+	__u32			peer_rmbe_size;	/* size of peer RMB element */
+	/* local RMB element cursors */
+	struct smc_diag_cursor	rx_prod;	/* received producer cursor */
+	struct smc_diag_cursor	rx_cons;	/* received consumer cursor */
+	/* peer RMB element cursors */
+	struct smc_diag_cursor	tx_prod;	/* sent producer cursor */
+	struct smc_diag_cursor	tx_cons;	/* sent consumer cursor */
+	__u8			rx_prod_flags;	/* received producer flags */
+	__u8			rx_conn_state_flags; /* recvd connection flags*/
+	__u8			tx_prod_flags;	/* sent producer flags */
+	__u8			tx_conn_state_flags; /* sent connection flags*/
+	/* send buffer cursors */
+	struct smc_diag_cursor	tx_prep;	/* prepared to be sent cursor */
+	struct smc_diag_cursor	tx_sent;	/* sent cursor */
+	struct smc_diag_cursor	tx_fin;		/* confirmed sent cursor */
+};
+
+/* SMC_DIAG_LINKINFO */
+
+struct smc_diag_linkinfo {
+	__u8 link_id;			/* link identifier */
+	__u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */
+	__u8 ibport;			/* RDMA device port number */
+	__u8 gid[40];			/* local GID */
+	__u8 peer_gid[40];		/* peer GID */
+};
+
+struct smc_diag_lgrinfo {
+	struct smc_diag_linkinfo	lnk[1];
+	__u8				role;
+};
+#endif /* _UAPI_SMC_DIAG_H_ */
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index bc029803e728..c717ef0896aa 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -9,3 +9,12 @@ config SMC
 	  a separate socket family SMC.
 
 	  Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+	tristate "SMC: socket monitoring interface"
+	depends on SMC
+	---help---
+	  Support for SMC socket monitoring interface used by tools such as
+	  smcss.
+
+	  if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 5cf0cafaa208..188104654b54 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_SMC)	+= smc.o
+obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3f543d58bc5c..5d4208ad029e 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -29,6 +29,7 @@
 #include <linux/in.h>
 #include <net/sock.h>
 #include <net/tcp.h>
+#include <net/smc.h>
 
 #include "smc.h"
 #include "smc_clc.h"
@@ -59,13 +60,48 @@ static void smc_set_keepalive(struct sock *sk, int val)
 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
 }
 
-static struct proto smc_proto = {
+static struct smc_hashinfo smc_v4_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+	struct hlist_head *head;
+
+	head = &h->ht;
+
+	write_lock_bh(&h->lock);
+	sk_add_node(sk, head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	write_unlock_bh(&h->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+	write_lock_bh(&h->lock);
+	if (sk_del_node_init(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
 	.name		= "SMC",
 	.owner		= THIS_MODULE,
 	.keepalive	= smc_set_keepalive,
+	.hash		= smc_hash_sk,
+	.unhash		= smc_unhash_sk,
 	.obj_size	= sizeof(struct smc_sock),
+	.h.smc_hash	= &smc_v4_hashinfo,
 	.slab_flags	= SLAB_DESTROY_BY_RCU,
 };
+EXPORT_SYMBOL_GPL(smc_proto);
 
 static int smc_release(struct socket *sock)
 {
@@ -109,6 +145,7 @@ static int smc_release(struct socket *sock)
 		schedule_delayed_work(&smc->sock_put_work,
 				      SMC_CLOSE_SOCK_PUT_DELAY);
 	}
+	sk->sk_prot->unhash(sk);
 	release_sock(sk);
 
 	sock_put(sk);
@@ -144,6 +181,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
 	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
+	sk->sk_prot->hash(sk);
 	sk_refcnt_debug_inc(sk);
 
 	return sk;
@@ -536,6 +574,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 		lsmc->sk.sk_err = -rc;
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -545,6 +584,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 			sock_release(new_clcsock);
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -1320,6 +1360,7 @@ static int __init smc_init(void)
 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
 		goto out_proto;
 	}
+	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
 
 	rc = smc_ib_register_client();
 	if (rc) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 959a5d2014ab..ee5fbea24549 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,6 +21,8 @@
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
 
+extern struct proto smc_proto;
+
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
 #endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index d70c05b57021..03dfcc6b7661 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -384,6 +384,7 @@ void smc_close_sock_put_work(struct work_struct *work)
 					    struct smc_sock,
 					    sock_put_work);
 
+	smc->sk.sk_prot->unhash(&smc->sk);
 	sock_put(&smc->sk);
 }
 
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+	sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		be16_to_cpu(((__be16 *)gid_raw)[0]),
+		be16_to_cpu(((__be16 *)gid_raw)[1]),
+		be16_to_cpu(((__be16 *)gid_raw)[2]),
+		be16_to_cpu(((__be16 *)gid_raw)[3]),
+		be16_to_cpu(((__be16 *)gid_raw)[4]),
+		be16_to_cpu(((__be16 *)gid_raw)[5]),
+		be16_to_cpu(((__be16 *)gid_raw)[6]),
+		be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	r->diag_family = sk->sk_family;
+	if (!smc->clcsock)
+		return;
+	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+	r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+	r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+	r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+				   struct smc_diag_msg *r,
+				   struct user_namespace *user_ns)
+{
+	if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+		return 1;
+
+	r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->diag_inode = sock_i_ino(sk);
+	return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+			   struct netlink_callback *cb,
+			   const struct smc_diag_req *req,
+			   struct nlattr *bc)
+{
+	struct smc_sock *smc = smc_sk(sk);
+	struct user_namespace *user_ns;
+	struct smc_diag_msg *r;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	smc_diag_msg_common_fill(r, sk);
+	r->diag_state = sk->sk_state;
+	r->diag_fallback = smc->use_fallback;
+	user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+		goto errout;
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+		struct smc_connection *conn = &smc->conn;
+		struct smc_diag_conninfo cinfo = {
+			.token = conn->alert_token_local,
+			.sndbuf_size = conn->sndbuf_size,
+			.rmbe_size = conn->rmbe_size,
+			.peer_rmbe_size = conn->peer_rmbe_size,
+
+			.rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+			.rx_prod.count = conn->local_rx_ctrl.prod.count,
+			.rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+			.rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+			.tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+			.tx_prod.count = conn->local_tx_ctrl.prod.count,
+			.tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+			.tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+			.tx_prod_flags =
+				*(u8 *)&conn->local_tx_ctrl.prod_flags,
+			.tx_conn_state_flags =
+				*(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+			.rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+			.rx_conn_state_flags =
+				*(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+			.tx_prep.wrap = conn->tx_curs_prep.wrap,
+			.tx_prep.count = conn->tx_curs_prep.count,
+			.tx_sent.wrap = conn->tx_curs_sent.wrap,
+			.tx_sent.count = conn->tx_curs_sent.count,
+			.tx_fin.wrap = conn->tx_curs_fin.wrap,
+			.tx_fin.count = conn->tx_curs_fin.count,
+		};
+
+		if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+			goto errout;
+	}
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+		struct smc_diag_lgrinfo linfo = {
+			.role = smc->conn.lgr->role,
+			.lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+			.lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+		};
+
+		memcpy(linfo.lnk[0].ibname,
+		       smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+		       sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+		smc_gid_be16_convert(linfo.lnk[0].gid,
+				     smc->conn.lgr->lnk[0].gid.raw);
+		smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+				     smc->conn.lgr->lnk[0].peer_gid);
+
+		if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+			goto errout;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *bc = NULL;
+	struct hlist_head *head;
+	struct sock *sk;
+	int rc = 0;
+
+	read_lock(&smc_proto.h.smc_hash->lock);
+	head = &smc_proto.h.smc_hash->ht;
+	if (hlist_empty(head))
+		goto out;
+
+	sk_for_each(sk, head) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+		if (rc)
+			break;
+	}
+
+out:
+	read_unlock(&smc_proto.h.smc_hash->lock);
+	return rc;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	struct net *net = sock_net(skb->sk);
+
+	if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+	    h->nlmsg_flags & NLM_F_DUMP) {
+		{
+			struct netlink_dump_control c = {
+				.dump = smc_diag_dump,
+				.min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+			};
+			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+		}
+	}
+	return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+	.family = AF_SMC,
+	.dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+	return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+	sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
-- 
cgit v1.2.3


From 39f19ebbf57b403695f7b5f9cf322fe1ddb5d7fb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 9 Jan 2017 10:19:50 -0800
Subject: bpf: rename ARG_PTR_TO_STACK

since ARG_PTR_TO_STACK is no longer just pointer to stack
rename it to ARG_PTR_TO_MEM and adjust comment.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 12 ++++++------
 kernel/bpf/helpers.c     |  4 ++--
 kernel/bpf/verifier.c    | 28 ++++++++++++++--------------
 kernel/trace/bpf_trace.c | 20 ++++++++++----------
 net/core/filter.c        | 40 ++++++++++++++++++++--------------------
 5 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'net')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f74ae68086dc..94ea8d2383e6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,14 +69,14 @@ enum bpf_arg_type {
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
 	 */
-	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
-	ARG_PTR_TO_RAW_STACK,	/* any pointer to eBPF program stack, area does not
-				 * need to be initialized, helper function must fill
-				 * all bytes or clear them in error case.
+	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
+				 * helper function must fill all bytes or clear
+				 * them in error case.
 				 */
 
-	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
-	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
+	ARG_CONST_SIZE,		/* number of bytes accessed from memory */
+	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3d4f7bf32aaf..2efdc9128e3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1034,8 +1034,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_STACK;
 		if (type != PTR_TO_PACKET && type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
 		expected_type = CONST_IMM;
 		/* One exception. Allow UNKNOWN_VALUE registers when the
 		 * boundaries are known and don't cause unsafe memory accesses
@@ -1050,8 +1050,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_STACK ||
-		   arg_type == ARG_PTR_TO_RAW_STACK) {
+	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -1062,7 +1062,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
 			 type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
 			goto err_type;
-		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -1108,9 +1108,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
 						   false, NULL);
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
-		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		/* bpf_xxx(..., buf, len) call will access 'len' bytes
 		 * from stack pointer 'buf'. Check it
@@ -1118,7 +1118,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+			verbose("ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
 
@@ -1235,15 +1235,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
-	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
 	return count > 1 ? -EINVAL : 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa77311dadb2..f883c43c96f3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.func		= bpf_probe_read,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 	.arg3_type	= ARG_ANYTHING,
 };
 
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_ANYTHING,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 	.func		= bpf_trace_printk,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -492,8 +492,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
diff --git a/net/core/filter.c b/net/core/filter.c
index 1969b3f118c1..f4d16a905754 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1416,8 +1416,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -1447,8 +1447,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_RAW_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1601,10 +1601,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.gpl_only	= false,
 	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -2306,8 +2306,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2377,8 +2377,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2412,8 +2412,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static struct metadata_dst __percpu *md_dst;
@@ -2483,8 +2483,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2509,8 +2509,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
@@ -2593,8 +2593,8 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
-- 
cgit v1.2.3


From b369e7fd41f7cdbe2488cb736ef4f958bb94b5e2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 9 Jan 2017 10:29:27 -0800
Subject: tcp: make TCP_INFO more consistent

tcp_get_info() has to lock the socket, so lets lock it
for an extended critical section, so that various fields
have consistent values.

This solves an annoying issue that some applications
reported when multiple counters are updated during one
particular rx/rx event, and TCP_INFO was called from
another cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ec97e4b4a62f..c8d46c140b4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2766,6 +2766,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_sacked = sk->sk_max_ack_backlog;
 		return;
 	}
+
+	slow = lock_sock_fast(sk);
+
 	info->tcpi_ca_state = icsk->icsk_ca_state;
 	info->tcpi_retransmits = icsk->icsk_retransmits;
 	info->tcpi_probes = icsk->icsk_probes_out;
@@ -2816,15 +2819,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	slow = lock_sock_fast(sk);
-
 	info->tcpi_bytes_acked = tp->bytes_acked;
 	info->tcpi_bytes_received = tp->bytes_received;
 	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
 	tcp_get_info_chrono_stats(tp, info);
 
-	unlock_sock_fast(sk, slow);
-
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
 
@@ -2840,6 +2839,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		do_div(rate64, intv);
 		info->tcpi_delivery_rate = rate64;
 	}
+	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-- 
cgit v1.2.3


From 3a89eaa65db68bf53bf92dedc60084f810e1779a Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Mon, 9 Jan 2017 16:49:26 -0500
Subject: net: dsa: select NET_SWITCHDEV

The support for DSA Ethernet switch chips depends on TCP/IP networking,
thus explicit that HAVE_NET_DSA depends on INET.

DSA uses SWITCHDEV, thus select it instead of depending on it.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Kconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 96e47c539bee..39bb5b3a82f2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,12 +1,13 @@
 config HAVE_NET_DSA
 	def_bool y
-	depends on NETDEVICES && !S390
+	depends on INET && NETDEVICES && !S390
 
 # Drivers must select NET_DSA and the appropriate tagging format
 
 config NET_DSA
 	tristate "Distributed Switch Architecture"
-	depends on HAVE_NET_DSA && NET_SWITCHDEV
+	depends on HAVE_NET_DSA
+	select NET_SWITCHDEV
 	select PHYLIB
 	---help---
 	  Say Y if you want to enable support for the hardware switches supported
-- 
cgit v1.2.3


From 961a9c0a4a9783e47f8cb76134b2303c4872380b Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 9 Jan 2017 14:20:45 +0100
Subject: xfrm: remove unused function

Has been ifdef'd out for more than 10 years, remove it.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_state.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
 {
 	xfrm_state_register_afinfo(&xfrm4_state_afinfo);
 }
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
-	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif  /*  0  */
-
-- 
cgit v1.2.3


From 423826a7b104724a16676a75d597a35d3bdb18b8 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 9 Jan 2017 14:20:46 +0100
Subject: xfrm: avoid rcu sparse warning

xfrm/xfrm_state.c:1973:21: error: incompatible types in comparison expression (different address spaces)

Harmless, but lets fix it to reduce the noise.

While at it, get rid of unneeded NULL check, its never hit:

net/ipv4/xfrm4_state.c: xfrm_state_register_afinfo(&xfrm4_state_afinfo);
net/ipv6/xfrm6_state.c: return xfrm_state_register_afinfo(&xfrm6_state_afinfo);
net/ipv6/xfrm6_state.c: xfrm_state_unregister_afinfo(&xfrm6_state_afinfo);

... are the only callsites.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 6b3366fef019..57e9578c35e2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1932,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km);
 int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
 {
 	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+
+	if (WARN_ON(afinfo->family >= NPROTO))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_state_afinfo_lock);
 	if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
 		err = -EEXIST;
@@ -1948,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo);
 
 int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 {
-	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+	int err = 0, family = afinfo->family;
+
+	if (WARN_ON(family >= NPROTO))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_state_afinfo_lock);
 	if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
-		if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
+		if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
 			err = -EINVAL;
 		else
 			RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
-- 
cgit v1.2.3


From af5d27c4e12b804c065c0e7c87507fea5683dab4 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 9 Jan 2017 14:20:47 +0100
Subject: xfrm: remove xfrm_state_put_afinfo

commit 44abdc3047aecafc141dfbaf1ed
("xfrm: replace rwlock on xfrm_state_afinfo with rcu") made
xfrm_state_put_afinfo equivalent to rcu_read_unlock.

Use spatch to replace it with direct calls to rcu_read_unlock:

@@
struct xfrm_state_afinfo *a;
@@

-  xfrm_state_put_afinfo(a);
+  rcu_read_unlock();

old:
 text    data     bss     dec     hex filename
22570      72     424   23066    5a1a xfrm_state.o
 1612       0       0    1612     64c xfrm_output.o
new:
22554      72     424   23050    5a0a xfrm_state.o
 1596       0       0    1596     63c xfrm_output.o

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  1 -
 net/xfrm/xfrm_output.c |  8 +++-----
 net/xfrm/xfrm_state.c  | 31 +++++++++++++------------------
 3 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 31947b9c21d6..957d0cc30691 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -343,7 +343,6 @@ struct xfrm_state_afinfo {
 int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
 int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
-void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
 struct xfrm_input_afinfo {
 	unsigned int		family;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..8ba29fe58352 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)
 		return;
 
 	afinfo = xfrm_state_get_afinfo(proto);
-	if (!afinfo)
-		return;
-
-	afinfo->local_error(skb, mtu);
-	xfrm_state_put_afinfo(afinfo);
+	if (afinfo)
+		afinfo->local_error(skb, mtu);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(xfrm_local_error);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 57e9578c35e2..783084484582 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
 	else
 		err = -EEXIST;
 	spin_unlock_bh(&xfrm_type_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
@@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
 	else
 		typemap[type->proto] = NULL;
 	spin_unlock_bh(&xfrm_type_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
@@ -235,13 +235,13 @@ retry:
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
 	if (!type && !modload_attempted) {
-		xfrm_state_put_afinfo(afinfo);
+		rcu_read_unlock();
 		request_module("xfrm-type-%d-%d", family, proto);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return type;
 }
 
@@ -280,7 +280,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family)
 
 out:
 	spin_unlock_bh(&xfrm_mode_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_mode);
@@ -308,7 +308,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
 	}
 
 	spin_unlock_bh(&xfrm_mode_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_mode);
@@ -331,13 +331,13 @@ retry:
 	if (unlikely(mode && !try_module_get(mode->owner)))
 		mode = NULL;
 	if (!mode && !modload_attempted) {
-		xfrm_state_put_afinfo(afinfo);
+		rcu_read_unlock();
 		request_module("xfrm-mode-%d-%d", family, encap);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return mode;
 }
 
@@ -651,13 +651,13 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 	afinfo->init_tempsel(&x->sel, fl);
 
 	if (family != tmpl->encap_family) {
-		xfrm_state_put_afinfo(afinfo);
+		rcu_read_unlock();
 		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
 		if (!afinfo)
 			return -1;
 	}
 	afinfo->init_temprop(x, tmpl, daddr, saddr);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -1474,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
 	if (afinfo->tmpl_sort)
 		err = afinfo->tmpl_sort(dst, src, n);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_tmpl_sort);
@@ -1494,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
 	if (afinfo->state_sort)
 		err = afinfo->state_sort(dst, src, n);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_sort);
@@ -1978,11 +1978,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
 	return afinfo;
 }
 
-void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
 void xfrm_state_delete_tunnel(struct xfrm_state *x)
 {
@@ -2025,7 +2020,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
 	if (afinfo->init_flags)
 		err = afinfo->init_flags(x);
 
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	if (err)
 		goto error;
-- 
cgit v1.2.3


From 711059b9752ad09ae6bcd4be8e48d30e5db483d8 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 9 Jan 2017 14:20:48 +0100
Subject: xfrm: add and use xfrm_state_afinfo_get_rcu

xfrm_init_tempstate is always called from within rcu read side section.
We can thus use a simpler function that doesn't call rcu_read_lock
again.

While at it, also make xfrm_init_tempstate return value void, the
return value was never tested.

A followup patch will replace remaining callers of xfrm_state_get_afinfo
with xfrm_state_afinfo_get_rcu variant and then remove the 'old'
get_afinfo interface.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    |  1 +
 net/xfrm/xfrm_state.c | 25 +++++++++++++++----------
 2 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 957d0cc30691..c52197cf51dc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -343,6 +343,7 @@ struct xfrm_state_afinfo {
 int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
 int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
+struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family);
 
 struct xfrm_input_afinfo {
 	unsigned int		family;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 783084484582..b5dad899fb0e 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -639,26 +639,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 }
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
-static int
+static void
 xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 		    const struct xfrm_tmpl *tmpl,
 		    const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		    unsigned short family)
 {
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return -1;
-	afinfo->init_tempsel(&x->sel, fl);
+	struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
+
+	if (afinfo)
+		afinfo->init_tempsel(&x->sel, fl);
 
 	if (family != tmpl->encap_family) {
-		rcu_read_unlock();
-		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
 		if (!afinfo)
-			return -1;
+			return;
 	}
 	afinfo->init_temprop(x, tmpl, daddr, saddr);
-	rcu_read_unlock();
-	return 0;
 }
 
 static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1966,6 +1963,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
 
+struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
+{
+	if (unlikely(family >= NPROTO))
+		return NULL;
+
+	return rcu_dereference(xfrm_state_afinfo[family]);
+}
+
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
 {
 	struct xfrm_state_afinfo *afinfo;
-- 
cgit v1.2.3


From 75cda62d9ca2cd3fab0412d438fcd1bfa0580b3d Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 9 Jan 2017 14:20:49 +0100
Subject: xfrm: state: simplify rcu_read_unlock handling in two spots

Instead of:
  if (foo) {
      unlock();
      return bar();
   }
   unlock();
do:
   unlock();
   if (foo)
       return bar();

This is ok because rcu protected structure is only dereferenced before
the conditional.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b5dad899fb0e..a62097e640b5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -231,17 +231,18 @@ retry:
 		return NULL;
 	typemap = afinfo->type_map;
 
-	type = typemap[proto];
+	type = READ_ONCE(typemap[proto]);
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
+
+	rcu_read_unlock();
+
 	if (!type && !modload_attempted) {
-		rcu_read_unlock();
 		request_module("xfrm-type-%d-%d", family, proto);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	rcu_read_unlock();
 	return type;
 }
 
@@ -327,17 +328,17 @@ retry:
 	if (unlikely(afinfo == NULL))
 		return NULL;
 
-	mode = afinfo->mode_map[encap];
+	mode = READ_ONCE(afinfo->mode_map[encap]);
 	if (unlikely(mode && !try_module_get(mode->owner)))
 		mode = NULL;
+
+	rcu_read_unlock();
 	if (!mode && !modload_attempted) {
-		rcu_read_unlock();
 		request_module("xfrm-mode-%d-%d", family, encap);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	rcu_read_unlock();
 	return mode;
 }
 
-- 
cgit v1.2.3


From a505e58252715bbc18a0ee1abae23615fe2586db Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan
Date: Tue, 10 Jan 2017 07:47:15 -0800
Subject: packet: pdiag_put_ring() should return TX_RING info for TPACKET_V3

Commit 7f953ab2ba46 ("af_packet: TX_RING support for TPACKET_V3")
now makes it possible to use TX_RING with TPACKET_V3, so make the
the relevant information available via 'ss -e -a --packet'

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/diag.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/packet/diag.c b/net/packet/diag.c
index 0ed68f0238bf..7ef1c881ae74 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
 {
 	struct packet_diag_ring pdr;
 
-	if (!ring->pg_vec || ((ver > TPACKET_V2) &&
-				(nl_type == PACKET_DIAG_TX_RING)))
+	if (!ring->pg_vec)
 		return 0;
 
 	pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-- 
cgit v1.2.3


From 7acec26cec5aa75e0dc54da1fb0b1f8acf575273 Mon Sep 17 00:00:00 2001
From: Jorge Ramirez-Ortiz
Date: Mon, 9 Jan 2017 15:25:49 +0100
Subject: cfg80211: wext does not need to set monitor channel in managed mode

There is not a valid reason to attempt setting the monitor channel
while in managed mode. Since this code path only deals with this mode,
remove the code block.

Johannes: I'll note that the comment indicated it was for backward
compatibility, but the code wasn't functional since switching the
monitor channel isn't supported (any more?) when in managed mode, as
that mode owns the channel configuration. Additionally, since monitor
can't be done on a managed mode interface, this would only have had
any effect to start with if a separate monitor interface is present,
in which case it's better to change the channel through that anyway,
if even possible.

Signed-off-by: Jorge Ramirez-Ortiz <jorge.ramirez-ortiz@linaro.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/wext-sme.c | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'net')

diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 995163830a61..c434f193f39a 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
 			goto out;
 	}
 
-
 	wdev->wext.connect.channel = chan;
-
-	/*
-	 * SSID is not set, we just want to switch monitor channel,
-	 * this is really just backward compatibility, if the SSID
-	 * is set then we use the channel to select the BSS to use
-	 * to connect to instead. If we were connected on another
-	 * channel we disconnected above and reconnect below.
-	 */
-	if (chan && !wdev->wext.connect.ssid_len) {
-		struct cfg80211_chan_def chandef = {
-			.width = NL80211_CHAN_WIDTH_20_NOHT,
-			.center_freq1 = freq,
-		};
-
-		chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
-		if (chandef.chan)
-			err = cfg80211_set_monitor_channel(rdev, &chandef);
-		else
-			err = -EINVAL;
-		goto out;
-	}
-
 	err = cfg80211_mgd_wext_connect(rdev, wdev);
  out:
 	wdev_unlock(wdev);
-- 
cgit v1.2.3


From 9f91484f6fcc28f9b5ebe11755e7488e39ea75e4 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Mon, 9 Jan 2017 18:13:51 -0500
Subject: net: dsa: make "label" property optional for dsa2

In the new DTS bindings for DSA (dsa2), the "ethernet" and "link"
phandles are respectively mandatory and exclusive to CPU port and DSA
link device tree nodes.

Simplify dsa2.c a bit by checking the presence of such phandle instead
of checking the redundant "label" property.

Then the Linux philosophy for Ethernet switch ports is to expose them to
userspace as standard NICs by default. Thus use the standard enumerated
"eth%d" device name if no "label" property is provided for a user port.
This allows to save DTS files from subjective net device names.

If one wants to rename an interface, udev rules can be used as usual.

Of course the current behavior is unchanged, and the optional "label"
property for user ports has precedence over the enumerated name.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dsa/dsa.txt | 20 ++++++++-----------
 net/dsa/dsa2.c                                    | 24 ++++-------------------
 2 files changed, 12 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index a4a570fb2494..cfe8f64eca4f 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.txt
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt
@@ -34,13 +34,9 @@ Required properties:
 
 Each port children node must have the following mandatory properties:
 - reg			: Describes the port address in the switch
-- label			: Describes the label associated with this port, which
-                          will become the netdev name. Special labels are
-			  "cpu" to indicate a CPU port and "dsa" to
-			  indicate an uplink/downlink port between switches in
-			  the cluster.
 
-A port labelled "dsa" has the following mandatory property:
+An uplink/downlink port between switches in the cluster has the following
+mandatory property:
 
 - link			: Should be a list of phandles to other switch's DSA
 			  port. This port is used as the outgoing port
@@ -48,12 +44,17 @@ A port labelled "dsa" has the following mandatory property:
 			  information must be given, not just the one hop
 			  routes to neighbouring switches.
 
-A port labelled "cpu" has the following mandatory property:
+A CPU port has the following mandatory property:
 
 - ethernet		: Should be a phandle to a valid Ethernet device node.
                           This host device is what the switch port is
 			  connected to.
 
+A user port has the following optional property:
+
+- label			: Describes the label associated with this port, which
+                          will become the netdev name.
+
 Port child nodes may also contain the following optional standardised
 properties, described in binding documents:
 
@@ -107,7 +108,6 @@ linked into one DSA cluster.
 
 			switch0port5: port@5 {
 				reg = <5>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch1port6
 					&switch2port9>;
@@ -119,7 +119,6 @@ linked into one DSA cluster.
 
 			port@6 {
 				reg = <6>;
-				label = "cpu";
 				ethernet = <&fec1>;
 				fixed-link {
 					speed = <100>;
@@ -165,7 +164,6 @@ linked into one DSA cluster.
 
 			switch1port5: port@5 {
 				reg = <5>;
-				label = "dsa";
 				link = <&switch2port9>;
 				phy-mode = "rgmii-txid";
 				fixed-link {
@@ -176,7 +174,6 @@ linked into one DSA cluster.
 
 			switch1port6: port@6 {
 				reg = <6>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch0port5>;
 				fixed-link {
@@ -255,7 +252,6 @@ linked into one DSA cluster.
 
 			switch2port9: port@9 {
 				reg = <9>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch1port5
 					&switch0port5>;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index bad119cee2a3..9526bdf2a34a 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -81,30 +81,12 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
 
 static bool dsa_port_is_dsa(struct device_node *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
-
-	if (!strcmp(name, "dsa"))
-		return true;
-
-	return false;
+	return !!of_parse_phandle(port, "link", 0);
 }
 
 static bool dsa_port_is_cpu(struct device_node *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
-
-	if (!strcmp(name, "cpu"))
-		return true;
-
-	return false;
+	return !!of_parse_phandle(port, "ethernet", 0);
 }
 
 static bool dsa_ds_find_port(struct dsa_switch *ds,
@@ -268,6 +250,8 @@ static int dsa_user_port_apply(struct device_node *port, u32 index,
 	int err;
 
 	name = of_get_property(port, "label", NULL);
+	if (!name)
+		name = "eth%d";
 
 	err = dsa_slave_create(ds, ds->dev, index, name);
 	if (err) {
-- 
cgit v1.2.3


From 44bb765cf07ab6622e6fdf4bce546b43bd20faee Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Tue, 10 Jan 2017 12:32:36 -0800
Subject: net: dsa: Implement ndo_get_phys_port_name()

Return the physical port number of a DSA created network device using
ndo_get_phys_port_name().

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5cd5b8137c08..fed3fbd403cb 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -990,6 +990,15 @@ static int dsa_slave_get_phys_port_id(struct net_device *dev,
 
 	ppid->id_len = sizeof(p->port);
 	memcpy(ppid->id, &p->port, ppid->id_len);
+}
+
+static int dsa_slave_get_phys_port_name(struct net_device *dev,
+					char *name, size_t len)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (snprintf(name, len, "p%d", p->port) >= len)
+		return -EINVAL;
 
 	return 0;
 }
@@ -1042,6 +1051,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
 	.ndo_get_phys_port_id	= dsa_slave_get_phys_port_id,
+	.ndo_get_phys_port_name	= dsa_slave_get_phys_port_name,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
-- 
cgit v1.2.3


From 592050b2541407d033da18226d3644644832d082 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Tue, 10 Jan 2017 12:32:37 -0800
Subject: Revert "net: dsa: Implement ndo_get_phys_port_id"

This reverts commit 3a543ef479868e36c95935de320608a7e41466ca ("net: dsa:
Implement ndo_get_phys_port_id") since it misuses the purpose of
ndo_get_phys_port_id(). We have ndo_get_phys_port_name() to do the
correct thing for us now.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index fed3fbd403cb..0cdcaf526987 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -983,15 +983,6 @@ static void dsa_slave_poll_controller(struct net_device *dev)
 }
 #endif
 
-static int dsa_slave_get_phys_port_id(struct net_device *dev,
-				      struct netdev_phys_item_id *ppid)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	ppid->id_len = sizeof(p->port);
-	memcpy(ppid->id, &p->port, ppid->id_len);
-}
-
 static int dsa_slave_get_phys_port_name(struct net_device *dev,
 					char *name, size_t len)
 {
@@ -1050,7 +1041,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
-	.ndo_get_phys_port_id	= dsa_slave_get_phys_port_id,
 	.ndo_get_phys_port_name	= dsa_slave_get_phys_port_name,
 };
 
-- 
cgit v1.2.3


From 55733350e5e8b70c5e54a30dbf98148c695f21f5 Mon Sep 17 00:00:00 2001
From: Simon Horman
Date: Wed, 11 Jan 2017 14:05:42 +0100
Subject: flow disector: ARP support

Allow dissection of (R)ARP operation hardware and protocol addresses
for Ethernet hardware and IPv4 protocol addresses.

There are currently no users of FLOW_DISSECTOR_KEY_ARP.
A follow-up patch will allow FLOW_DISSECTOR_KEY_ARP to be used by the
flower classifier.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 19 +++++++++++++++
 net/core/flow_dissector.c    | 57 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

(limited to 'net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d896a33e00d4..ac9703018a3a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -88,6 +88,24 @@ struct flow_dissector_key_addrs {
 	};
 };
 
+/**
+ * flow_dissector_key_arp:
+ *	@ports: Operation, source and target addresses for an ARP header
+ *              for Ethernet hardware addresses and IPv4 protocol addresses
+ *		sip: Sender IP address
+ *		tip: Target IP address
+ *		op:  Operation
+ *		sha: Sender hardware address
+ *		tpa: Target hardware address
+ */
+struct flow_dissector_key_arp {
+	__u32 sip;
+	__u32 tip;
+	__u8 op;
+	unsigned char sha[ETH_ALEN];
+	unsigned char tha[ETH_ALEN];
+};
+
 /**
  * flow_dissector_key_tp_ports:
  *	@ports: port numbers of Transport header
@@ -141,6 +159,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
+	FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
 	FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
 	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index fe4e1531976c..5b3800fe20f3 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -138,6 +138,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_arp *key_arp;
 	struct flow_dissector_key_ports *key_ports;
 	struct flow_dissector_key_icmp *key_icmp;
 	struct flow_dissector_key_tags *key_tags;
@@ -379,6 +380,62 @@ mpls:
 
 		nhoff += FCOE_HEADER_LEN;
 		goto out_good;
+
+	case htons(ETH_P_ARP):
+	case htons(ETH_P_RARP): {
+		struct {
+			unsigned char ar_sha[ETH_ALEN];
+			unsigned char ar_sip[4];
+			unsigned char ar_tha[ETH_ALEN];
+			unsigned char ar_tip[4];
+		} *arp_eth, _arp_eth;
+		const struct arphdr *arp;
+		struct arphdr *_arp;
+
+		arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+					   hlen, &_arp);
+		if (!arp)
+			goto out_bad;
+
+		if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+		    arp->ar_pro != htons(ETH_P_IP) ||
+		    arp->ar_hln != ETH_ALEN ||
+		    arp->ar_pln != 4 ||
+		    (arp->ar_op != htons(ARPOP_REPLY) &&
+		     arp->ar_op != htons(ARPOP_REQUEST)))
+			goto out_bad;
+
+		arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+					       sizeof(_arp_eth), data,
+					       hlen - sizeof(_arp),
+					       &_arp_eth);
+		if (!arp)
+			goto out_bad;
+
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_ARP)) {
+
+			key_arp = skb_flow_dissector_target(flow_dissector,
+							    FLOW_DISSECTOR_KEY_ARP,
+							    target_container);
+
+			memcpy(&key_arp->sip, arp_eth->ar_sip,
+			       sizeof(key_arp->sip));
+			memcpy(&key_arp->tip, arp_eth->ar_tip,
+			       sizeof(key_arp->tip));
+
+			/* Only store the lower byte of the opcode;
+			 * this covers ARPOP_REPLY and ARPOP_REQUEST.
+			 */
+			key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+			ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+			ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+		}
+
+		goto out_good;
+	}
+
 	default:
 		goto out_bad;
 	}
-- 
cgit v1.2.3


From 99d31326cbe6951872af5c8a6bc2679388a4d9ef Mon Sep 17 00:00:00 2001
From: Simon Horman
Date: Wed, 11 Jan 2017 14:05:43 +0100
Subject: net/sched: cls_flower: Support matching on ARP

Support matching on ARP operation, and hardware and protocol addresses
for Ethernet hardware and IPv4 protocol addresses.

Example usage:

tc qdisc add dev eth0 ingress

tc filter add dev eth0 protocol arp parent ffff: flower indev eth0 \
	arp_op request arp_sip 10.0.0.1 action drop
tc filter add dev eth0 protocol rarp parent ffff: flower indev eth0 \
	arp_op reply arp_tha 52:54:3f:00:00:00/24 action drop

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 11 ++++++++++
 net/sched/cls_flower.c       | 51 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index a081efbd61a2..1e5e1ddfdaca 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -416,6 +416,17 @@ enum {
 	TCA_FLOWER_KEY_ICMPV6_TYPE,	/* u8 */
 	TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */
 
+	TCA_FLOWER_KEY_ARP_SIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_SIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_OP,		/* u8 */
+	TCA_FLOWER_KEY_ARP_OP_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ARP_SHA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_SHA_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA_MASK,	/* ETH_ALEN */
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 970db7a41684..a3bfda3091a4 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -40,6 +40,7 @@ struct fl_flow_key {
 	};
 	struct flow_dissector_key_ports tp;
 	struct flow_dissector_key_icmp icmp;
+	struct flow_dissector_key_arp arp;
 	struct flow_dissector_key_keyid enc_key_id;
 	union {
 		struct flow_dissector_key_ipv4_addrs enc_ipv4;
@@ -401,6 +402,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ICMPV6_CODE]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_SIP]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_SIP_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_TIP]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_TIP_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_OP]		= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_OP_MASK]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_SHA]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_SHA_MASK]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_THA]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_THA_MASK]	= { .len = ETH_ALEN },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -572,6 +583,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       &mask->icmp.code,
 			       TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
 			       sizeof(key->icmp.code));
+	} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
+		   key->basic.n_proto == htons(ETH_P_RARP)) {
+		fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
+			       &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
+			       sizeof(key->arp.sip));
+		fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
+			       &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
+			       sizeof(key->arp.tip));
+		fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
+			       &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
+			       sizeof(key->arp.op));
+		fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+			       mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+			       sizeof(key->arp.sha));
+		fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+			       mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+			       sizeof(key->arp.tha));
 	}
 
 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
@@ -688,6 +716,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
 			     FLOW_DISSECTOR_KEY_PORTS, tp);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ICMP, icmp);
+	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+			     FLOW_DISSECTOR_KEY_ARP, arp);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_VLAN, vlan);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
@@ -1112,6 +1142,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 				  TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
 				  sizeof(key->icmp.code))))
 		goto nla_put_failure;
+	else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
+		  key->basic.n_proto == htons(ETH_P_RARP)) &&
+		 (fl_dump_key_val(skb, &key->arp.sip,
+				  TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
+				  TCA_FLOWER_KEY_ARP_SIP_MASK,
+				  sizeof(key->arp.sip)) ||
+		  fl_dump_key_val(skb, &key->arp.tip,
+				  TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
+				  TCA_FLOWER_KEY_ARP_TIP_MASK,
+				  sizeof(key->arp.tip)) ||
+		  fl_dump_key_val(skb, &key->arp.op,
+				  TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
+				  TCA_FLOWER_KEY_ARP_OP_MASK,
+				  sizeof(key->arp.op)) ||
+		  fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+				  mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+				  sizeof(key->arp.sha)) ||
+		  fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+				  mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+				  sizeof(key->arp.tha))))
+		goto nla_put_failure;
 
 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
-- 
cgit v1.2.3


From cef0acd4d7d4811d2d19cd0195031bf0dfe41249 Mon Sep 17 00:00:00 2001
From: David Spinadel
Date: Mon, 21 Nov 2016 16:58:40 +0200
Subject: mac80211: Add RX flag to indicate ICV stripped

Add a flag that indicates that the WEP ICV was stripped from an
RX packet, allowing the device to not transfer that if it's
already checked.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 5 ++++-
 net/mac80211/wep.c     | 3 ++-
 net/mac80211/wpa.c     | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5f5cb194cd78..86967b85dfd0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1017,7 +1017,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_DECRYPTED: This frame was decrypted in hardware.
  * @RX_FLAG_MMIC_STRIPPED: the Michael MIC is stripped off this frame,
  *	verification has been done by the hardware.
- * @RX_FLAG_IV_STRIPPED: The IV/ICV are stripped from this frame.
+ * @RX_FLAG_IV_STRIPPED: The IV and ICV are stripped from this frame.
  *	If this flag is set, the stack cannot do any replay detection
  *	hence the driver or hardware will have to do that.
  * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this
@@ -1088,6 +1088,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before.
  *	This is used for AMSDU subframes which can have the same PN as
  *	the first subframe.
+ * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must
+ *	be done in the hardware.
  */
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
@@ -1123,6 +1125,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_RADIOTAP_VENDOR_DATA	= BIT(31),
 	RX_FLAG_MIC_STRIPPED		= BIT_ULL(32),
 	RX_FLAG_ALLOW_SAME_PN		= BIT_ULL(33),
+	RX_FLAG_ICV_STRIPPED		= BIT_ULL(34),
 };
 
 #define RX_FLAG_STBC_SHIFT		26
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index efa3f48f1ec5..73e8f347802e 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
 			return RX_DROP_UNUSABLE;
 		ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
 		/* remove ICV */
-		if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
+		if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
+		    pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
 			return RX_DROP_UNUSABLE;
 	}
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 8af6dd388d11..c1ef22df865f 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
 		return RX_DROP_UNUSABLE;
 
 	/* Trim ICV */
-	skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
+	if (!(status->flag & RX_FLAG_ICV_STRIPPED))
+		skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
 
 	/* Remove IV */
 	memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
-- 
cgit v1.2.3


From 2cc6f5a715cf9f7d5ca8fd2bde62130fcfd90b94 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Wed, 19 Oct 2016 15:02:32 +0200
Subject: mac80211: set wifi_acked[_valid] bits for transmitted SKBs

There may be situations in which the in-kernel originator of an
SKB cares about its wifi transmission status. To have that, set
the wifi_acked[_valid] bits before freeing/orphaning the SKB if
the destructor is set. The originator can then use it in there.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/status.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index f7c5ae597639..d6a1bfaa7a81 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -541,6 +541,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 	} else if (info->ack_frame_id) {
 		ieee80211_report_ack_skb(local, info, acked, dropped);
 	}
+
+	if (!dropped && skb->destructor) {
+		skb->wifi_acked_valid = 1;
+		skb->wifi_acked = acked;
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 738b35ccee1bcd7cf4af147edd76e7880533ad9f Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Wed, 11 Jan 2017 21:13:02 -0800
Subject: net: core: Make netif_wake_subqueue a wrapper

netif_wake_subqueue() is duplicating the same thing that netif_tx_wake_queue()
does, so make it call it directly after looking up the queue from the index.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 +++++++++++++-
 net/core/dev.c            | 22 ----------------------
 2 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ebd9e2c12f44..97ae0ac513ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3106,7 +3106,19 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev,
 	return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
 }
 
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index);
+/**
+ *	netif_wake_subqueue - allow sending packets on subqueue
+ *	@dev: network device
+ *	@queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+
+	netif_tx_wake_queue(txq);
+}
 
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index e98cc06d2577..ad5959e56116 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2408,28 +2408,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
 }
 EXPORT_SYMBOL(netif_schedule_queue);
 
-/**
- *	netif_wake_subqueue - allow sending packets on subqueue
- *	@dev: network device
- *	@queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-
-	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
-		struct Qdisc *q;
-
-		rcu_read_lock();
-		q = rcu_dereference(txq->qdisc);
-		__netif_schedule(q);
-		rcu_read_unlock();
-	}
-}
-EXPORT_SYMBOL(netif_wake_subqueue);
-
 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 {
 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
-- 
cgit v1.2.3


From 526735ddc0ae8c7a7751e9ad3a57ae76cd3c35d5 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Thu, 12 Jan 2017 14:57:14 +0100
Subject: net: fix AF_SMC related typo

When introducing the new socket family AF_SMC in
commit ac7138746e14 ("smc: establish new socket family"),
a typo in af_family_clock_key_strings has slipped in.
This patch repairs it.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Fixes: ac7138746e14 ("smc: establish new socket family")
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index dbbdc4f16789..8b35debfe454 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
-  "clock-AF_QIPCRTR", "closck-AF_smc"    , "clock-AF_MAX"
+  "clock-AF_QIPCRTR", "clock-AF_SMC"     , "clock-AF_MAX"
 };
 
 /*
-- 
cgit v1.2.3


From 143c017108f8a695e4e1c1f5ec57ee89be9cad70 Mon Sep 17 00:00:00 2001
From: Ursula Braun
Date: Thu, 12 Jan 2017 14:57:15 +0100
Subject: smc: ETH_ALEN as memcpy length for mac addresses

When creating an SMC connection, there is a CLC (connection layer control)
handshake to prepare for RDMA traffic. The corresponding code is part of
commit 0cfdd8f92cac ("smc: connection and link group creation").
Mac addresses to be exchanged in the handshake are copied with a wrong
length of 12 instead of 6 bytes. Following code overwrites the wrongly
copied code, but nevertheless the correct length should already be used for
the preceding mac address copying. Use ETH_ALEN for the memcpy length with
mac addresses.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Fixes: 0cfdd8f92cac ("smc: connection and link group creation")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_clc.c | 10 ++++------
 net/smc/smc_ib.h  |  4 +++-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index e1e684c562b8..cc6b6f8651eb 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/in.h>
+#include <linux/if_ether.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
@@ -151,8 +152,7 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
 	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
 	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
-	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1],
-	       sizeof(smcibdev->mac[ibport - 1]));
+	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
 
 	/* determine subnet and mask from internal TCP socket */
 	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
@@ -199,8 +199,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
 	memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
 	       SMC_GID_SIZE);
-	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
-	       sizeof(link->smcibdev->mac));
+	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
 	hton24(cclc.qpn, link->roce_qp->qp_num);
 	cclc.rmb_rkey =
 		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
@@ -252,8 +251,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
 	memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
 	       SMC_GID_SIZE);
-	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
-	       sizeof(link->smcibdev->mac[link->ibport - 1]));
+	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
 	hton24(aclc.qpn, link->roce_qp->qp_num);
 	aclc.rmb_rkey =
 		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 3fe2d55c6051..a95f74bb5569 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -11,6 +11,7 @@
 #ifndef _SMC_IB_H
 #define _SMC_IB_H
 
+#include <linux/if_ether.h>
 #include <rdma/ib_verbs.h>
 
 #define SMC_MAX_PORTS			2	/* Max # of ports */
@@ -34,7 +35,8 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	struct ib_cq		*roce_cq_recv;	/* recv completion queue */
 	struct tasklet_struct	send_tasklet;	/* called by send cq handler */
 	struct tasklet_struct	recv_tasklet;	/* called by recv cq handler */
-	char			mac[SMC_MAX_PORTS][6]; /* mac address per port*/
+	char			mac[SMC_MAX_PORTS][ETH_ALEN];
+						/* mac address per port*/
 	union ib_gid		gid[SMC_MAX_PORTS]; /* gid per port */
 	u8			initialized : 1; /* ib dev CQ, evthdl done */
 	struct work_struct	port_event_work;
-- 
cgit v1.2.3


From 6b8cc1d11ef75c5b9c530b3d0d148f3c2dd25f93 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 12 Jan 2017 11:51:32 +0100
Subject: bpf: pass original insn directly to convert_ctx_access

Currently, when calling convert_ctx_access() callback for the various
program types, we pass in insn->dst_reg, insn->src_reg, insn->off from
the original instruction. This information is needed to rewrite the
instruction that is based on the user ctx structure into a kernel
representation for the ctx. As we'd like to allow access size beyond
just BPF_W, we'd need also insn->code for that in order to decode the
original access size. Given that, lets just pass insn directly to the
convert_ctx_access() callback and work on that to not clutter the
callback with even more arguments we need to pass when everything is
already contained in insn. So lets go through that once, no functional
change.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   7 ++-
 kernel/bpf/verifier.c    |   3 +-
 kernel/trace/bpf_trace.c |  15 ++---
 net/core/filter.c        | 139 +++++++++++++++++++++++++----------------------
 4 files changed, 87 insertions(+), 77 deletions(-)

(limited to 'net')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 94ea8d2383e6..f8c3560b01db 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -161,9 +161,10 @@ struct bpf_verifier_ops {
 				enum bpf_reg_type *reg_type);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
-	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
-				  int src_reg, int ctx_off,
-				  struct bpf_insn *insn, struct bpf_prog *prog);
+	u32 (*convert_ctx_access)(enum bpf_access_type type,
+				  const struct bpf_insn *src,
+				  struct bpf_insn *dst,
+				  struct bpf_prog *prog);
 };
 
 struct bpf_prog_type_list {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2efdc9128e3c..df7e47244e75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3177,8 +3177,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
 			continue;
 
-		cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					      insn->off, insn_buf, env->prog);
+		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f883c43c96f3..1860e7f1e5a8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -572,28 +572,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				      int src_reg, int ctx_off,
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
+				      const struct bpf_insn *si,
 				      struct bpf_insn *insn_buf,
 				      struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_perf_event_data, sample_period):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						       data), dst_reg, src_reg,
+						       data), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, data));
-		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
 				      offsetof(struct perf_sample_data, period));
 		break;
 	default:
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						       regs), dst_reg, src_reg,
+						       regs), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, regs));
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
+				      si->off);
 		break;
 	}
 
diff --git a/net/core/filter.c b/net/core/filter.c
index f4d16a905754..8cfbdefbfb1c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2972,32 +2972,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					int src_reg, int ctx_off,
+static u32 sk_filter_convert_ctx_access(enum bpf_access_type type,
+					const struct bpf_insn *si,
 					struct bpf_insn *insn_buf,
 					struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, len):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, len));
 		break;
 
 	case offsetof(struct __sk_buff, protocol):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, protocol));
 		break;
 
 	case offsetof(struct __sk_buff, vlan_proto):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, vlan_proto));
 		break;
 
@@ -3005,17 +3006,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		break;
 
 	case offsetof(struct __sk_buff, ingress_ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, skb_iif));
 		break;
 
@@ -3023,17 +3024,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 
 	case offsetof(struct __sk_buff, hash):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, hash));
 		break;
 
@@ -3041,63 +3042,74 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		break;
 
 	case offsetof(struct __sk_buff, pkt_type):
-		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, queue_mapping):
-		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_present):
 		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_tci):
 		return convert_skb_access(SKF_AD_VLAN_TAG,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, cb[0]) ...
 	     offsetof(struct __sk_buff, cb[4]):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
 
 		prog->cb_access = 1;
-		ctx_off -= offsetof(struct __sk_buff, cb[0]);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, data);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, cb[0]);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, data);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_classid):
-		ctx_off -= offsetof(struct __sk_buff, tc_classid);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, tc_classid);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, tc_classid);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, data));
 		break;
 
 	case offsetof(struct __sk_buff, data_end):
-		ctx_off -= offsetof(struct __sk_buff, data_end);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
-				      ctx_off);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_end);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_end);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_index):
@@ -3105,110 +3117,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
-		break;
 #else
 		if (type == BPF_WRITE)
-			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
 		else
-			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
-		break;
+			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
 #endif
+		break;
 	}
 
 	return insn - insn_buf;
 }
 
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  int dst_reg, int src_reg,
-					  int ctx_off,
+					  const struct bpf_insn *si,
 					  struct bpf_insn *insn_buf,
 					  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_sock, bound_dev_if):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					offsetof(struct sock, sk_bound_dev_if));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
 
 	case offsetof(struct bpf_sock, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_family));
 		break;
 
 	case offsetof(struct bpf_sock, type):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
 		break;
 
 	case offsetof(struct bpf_sock, protocol):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
 		break;
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					 int src_reg, int ctx_off,
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
 					 struct bpf_insn *insn_buf,
 					 struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 	default:
-		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
-						    ctx_off, insn_buf, prog);
+		return sk_filter_convert_ctx_access(type, si, insn_buf, prog);
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				  int src_reg, int ctx_off,
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
 				  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct xdp_md, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
 	}
-- 
cgit v1.2.3


From 62c7989b24dbd348c2507ee6458ebf5637d6ddb5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 12 Jan 2017 11:51:33 +0100
Subject: bpf: allow b/h/w/dw access for bpf's cb in ctx

When structs are used to store temporary state in cb[] buffer that is
used with programs and among tail calls, then the generated code will
not always access the buffer in bpf_w chunks. We can ease programming
of it and let this act more natural by allowing for aligned b/h/w/dw
sized access for cb[] ctx member. Various test cases are attached as
well for the selftest suite. Potentially, this can also be reused for
other program types to pass data around.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       |   8 +-
 net/core/filter.c                           |  41 ++-
 tools/testing/selftests/bpf/test_verifier.c | 442 +++++++++++++++++++++++++++-
 3 files changed, 478 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df7e47244e75..d60e12c67266 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3165,10 +3165,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	insn = env->prog->insnsi + delta;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
 			type = BPF_READ;
-		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+		else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+			 insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+			 insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
 			 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
 			type = BPF_WRITE;
 		else
diff --git a/net/core/filter.c b/net/core/filter.c
index 8cfbdefbfb1c..90383860e224 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2776,11 +2776,33 @@ static bool __is_valid_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
+
+	switch (off) {
+	case offsetof(struct __sk_buff, cb[0]) ...
+	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+		if (size == sizeof(__u16) &&
+		    off > offsetof(struct __sk_buff, cb[4]) + sizeof(__u16))
+			return false;
+		if (size == sizeof(__u32) &&
+		    off > offsetof(struct __sk_buff, cb[4]))
+			return false;
+		if (size == sizeof(__u64) &&
+		    off > offsetof(struct __sk_buff, cb[2]))
+			return false;
+		if (size != sizeof(__u8)  &&
+		    size != sizeof(__u16) &&
+		    size != sizeof(__u32) &&
+		    size != sizeof(__u64))
+			return false;
+		break;
+	default:
+		if (size != sizeof(__u32))
+			return false;
+	}
 
 	return true;
 }
@@ -2799,7 +2821,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 			break;
 		default:
 			return false;
@@ -2823,7 +2845,7 @@ static bool lwt_is_valid_access(int off, int size,
 		case offsetof(struct __sk_buff, mark):
 		case offsetof(struct __sk_buff, priority):
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 			break;
 		default:
 			return false;
@@ -2915,7 +2937,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		case offsetof(struct __sk_buff, tc_index):
 		case offsetof(struct __sk_buff, priority):
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 		case offsetof(struct __sk_buff, tc_classid):
 			break;
 		default:
@@ -3066,8 +3088,11 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type,
 					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, cb[0]) ...
-	     offsetof(struct __sk_buff, cb[4]):
+	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+			      offsetof(struct qdisc_skb_cb, data)) %
+			     sizeof(__u64));
 
 		prog->cb_access = 1;
 		off  = si->off;
@@ -3075,10 +3100,10 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type,
 		off += offsetof(struct sk_buff, cb);
 		off += offsetof(struct qdisc_skb_cb, data);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg,
+			*insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
 					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg,
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
 					      si->src_reg, off);
 		break;
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 9bb45346dc72..1aa73241c999 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -859,15 +859,451 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
-		"check non-u32 access to cb",
+		"check cb access: byte",
 		.insns = {
-			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1,
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 1),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 2),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 3),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1]) + 1),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1]) + 2),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1]) + 3),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2]) + 1),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2]) + 2),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2]) + 3),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3]) + 1),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3]) + 2),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3]) + 3),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 1),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 2),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 3),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) + 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) + 2),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) + 3),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1]) + 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1]) + 2),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1]) + 3),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2]) + 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2]) + 2),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2]) + 3),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3]) + 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3]) + 2),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3]) + 3),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 2),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 3),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"check cb access: byte, oob 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: byte, oob 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) - 1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: byte, oob 3",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: byte, oob 4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) - 1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: byte, wrong type",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"check cb access: half",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 2),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1]) + 2),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2]) + 2),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3]) + 2),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 2),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) + 2),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1]) + 2),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2]) + 2),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3]) + 2),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"check cb access: half, unaligned",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: half, oob 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: half, oob 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) - 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: half, oob 3",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: half, oob 4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) - 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: half, wrong type",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"check cb access: word",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"check cb access: word, unaligned 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) + 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: word, unaligned 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: word, unaligned 3",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: word, unaligned 4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"check cb access: double, unaligned 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, unaligned 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[4]) + 8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 3",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[0]) - 8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4]) + 8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, oob 6",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0]) - 8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check cb access: double, wrong type",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
 				    offsetof(struct __sk_buff, cb[0])),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
-		.errstr_unpriv = "R1 leaks addr",
 		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
 	},
 	{
 		"check out of range skb->cb access",
-- 
cgit v1.2.3


From 79471b10d67a52f5b3744d2e14c06437a65746f2 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Thu, 12 Jan 2017 14:39:28 +0000
Subject: lwt_bpf: bpf_lwt_prog_cmp() can be static

Fixes the following sparse warning:

net/core/lwt_bpf.c:355:5: warning:
 symbol 'bpf_lwt_prog_cmp' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/lwt_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 71bb3e2eca08..40ef8ae8d93d 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -352,7 +352,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
 	       0;
 }
 
-int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
 {
 	/* FIXME:
 	 * The LWT state is currently rebuilt for delete requests which
-- 
cgit v1.2.3


From c1ce1560a1ae1a58505c26bc0e46ce1aee982d54 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 11 Jan 2017 18:10:37 -0800
Subject: secure_seq: fix sparse errors

Fixes following warnings :

net/core/secure_seq.c:125:28: warning: incorrect type in argument 1
(different base types)
net/core/secure_seq.c:125:28:    expected unsigned int const [unsigned]
[usertype] a
net/core/secure_seq.c:125:28:    got restricted __be32 [usertype] saddr
net/core/secure_seq.c:125:35: warning: incorrect type in argument 2
(different base types)
net/core/secure_seq.c:125:35:    expected unsigned int const [unsigned]
[usertype] b
net/core/secure_seq.c:125:35:    got restricted __be32 [usertype] daddr
net/core/secure_seq.c:125:43: warning: cast from restricted __be16
net/core/secure_seq.c:125:61: warning: restricted __be16 degrades to
integer

Fixes: 7cd23e5300c1 ("secure_seq: use SipHash in place of MD5")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/secure_seq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 3a9fcec94ace..758f140b6bed 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -122,7 +122,9 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 {
 	u64 seq;
 	net_secret_init();
-	seq = siphash_3u32(saddr, daddr, (u32)sport << 16 | dport, &net_secret);
+	seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			   (__force u32)sport << 16 | (__force u32)dport,
+			   &net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
 	return seq;
-- 
cgit v1.2.3


From 8fb472c09b9df478a062eacc7841448e40fc3c17 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Thu, 12 Jan 2017 15:53:33 +0100
Subject: ipmr: improve hash scalability

Recently we started using ipmr with thousands of entries and easily hit
soft lockups on smaller devices. The reason is that the hash function
uses the high order bits from the src and dst, but those don't change in
many common cases, also the hash table  is only 64 elements so with
thousands it doesn't scale at all.
This patch migrates the hash table to rhashtable, and in particular the
rhl interface which allows for duplicate elements to be chained because
of the MFC_PROXY support (*,G; *,*,oif cases) which allows for multiple
duplicate entries to be added with different interfaces (IMO wrong, but
it's been in for a long time).

And here are some results from tests I've run in a VM:
 mr_table size (default, allocated for all namespaces):
  Before                    After
   49304 bytes               2400 bytes

 Add 65000 routes (the diff is much larger on smaller devices):
  Before                    After
   1m42s                     58s

 Forwarding 256 byte packets with 65000 routes (test done in a VM):
  Before                    After
   3 Mbps / ~1465 pps        122 Mbps / ~59000 pps

As a bonus we no longer see the soft lockups on smaller devices which
showed up even with 2000 entries before.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h |  57 ++++++++---
 net/ipv4/ipmr.c        | 255 +++++++++++++++++++++++++++----------------------
 2 files changed, 182 insertions(+), 130 deletions(-)

(limited to 'net')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index f019b62f27b5..d7f63339ef0b 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -3,6 +3,7 @@
 
 #include <linux/in.h>
 #include <linux/pim.h>
+#include <linux/rhashtable.h>
 #include <net/sock.h>
 #include <uapi/linux/mroute.h>
 
@@ -60,7 +61,6 @@ struct vif_device {
 #define VIFF_STATIC 0x8000
 
 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
-#define MFC_LINES 64
 
 struct mr_table {
 	struct list_head	list;
@@ -69,8 +69,9 @@ struct mr_table {
 	struct sock __rcu	*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc_unres_queue;
-	struct list_head	mfc_cache_array[MFC_LINES];
 	struct vif_device	vif_table[MAXVIFS];
+	struct rhltable		mfc_hash;
+	struct list_head	mfc_cache_list;
 	int			maxvif;
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
@@ -85,17 +86,48 @@ enum {
 	MFC_STATIC = BIT(0),
 };
 
+struct mfc_cache_cmp_arg {
+	__be32 mfc_mcastgrp;
+	__be32 mfc_origin;
+};
+
+/**
+ * struct mfc_cache - multicast routing entries
+ * @mnode: rhashtable list
+ * @mfc_mcastgrp: destination multicast group address
+ * @mfc_origin: source address
+ * @cmparg: used for rhashtable comparisons
+ * @mfc_parent: source interface (iif)
+ * @mfc_flags: entry flags
+ * @expires: unresolved entry expire time
+ * @unresolved: unresolved cached skbs
+ * @last_assert: time of last assert
+ * @minvif: minimum VIF id
+ * @maxvif: maximum VIF id
+ * @bytes: bytes that have passed for this entry
+ * @pkt: packets that have passed for this entry
+ * @wrong_if: number of wrong source interface hits
+ * @lastuse: time of last use of the group (traffic or update)
+ * @ttls: OIF TTL threshold array
+ * @list: global entry list
+ * @rcu: used for entry destruction
+ */
 struct mfc_cache {
-	struct list_head list;
-	__be32 mfc_mcastgrp;			/* Group the entry belongs to 	*/
-	__be32 mfc_origin;			/* Source of packet 		*/
-	vifi_t mfc_parent;			/* Source interface		*/
-	int mfc_flags;				/* Flags on line		*/
+	struct rhlist_head mnode;
+	union {
+		struct {
+			__be32 mfc_mcastgrp;
+			__be32 mfc_origin;
+		};
+		struct mfc_cache_cmp_arg cmparg;
+	};
+	vifi_t mfc_parent;
+	int mfc_flags;
 
 	union {
 		struct {
 			unsigned long expires;
-			struct sk_buff_head unresolved;	/* Unresolved buffers		*/
+			struct sk_buff_head unresolved;
 		} unres;
 		struct {
 			unsigned long last_assert;
@@ -105,18 +137,13 @@ struct mfc_cache {
 			unsigned long pkt;
 			unsigned long wrong_if;
 			unsigned long lastuse;
-			unsigned char ttls[MAXVIFS];	/* TTL thresholds		*/
+			unsigned char ttls[MAXVIFS];
 		} res;
 	} mfc_un;
+	struct list_head list;
 	struct rcu_head	rcu;
 };
 
-#ifdef __BIG_ENDIAN
-#define MFC_HASH(a,b)	(((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1))
-#else
-#define MFC_HASH(a,b)	((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1))
-#endif
-
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 824c4fdf21eb..beacd028848c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 }
 #endif
 
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+				const void *ptr)
+{
+	const struct mfc_cache_cmp_arg *cmparg = arg->key;
+	struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+	return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+	       cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+	.head_offset = offsetof(struct mfc_cache, mnode),
+	.key_offset = offsetof(struct mfc_cache, cmparg),
+	.key_len = sizeof(struct mfc_cache_cmp_arg),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = ipmr_hash_cmp,
+	.automatic_shrinking = true,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
-	unsigned int i;
 
 	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
 	if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	write_pnet(&mrt->net, net);
 	mrt->id = id;
 
-	/* Forwarding cache */
-	for (i = 0; i < MFC_LINES; i++)
-		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+	rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, true);
+	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
 
@@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
 {
-	int line = MFC_HASH(mcastgrp, origin);
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
-			return c;
-	}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		return c;
+
 	return NULL;
 }
 
@@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 						    int vifi)
 {
-	int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = htonl(INADDR_ANY),
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == htonl(INADDR_ANY) &&
-		    c->mfc_un.res.ttls[vifi] < 255)
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (c->mfc_un.res.ttls[vifi] < 255)
 			return c;
 
 	return NULL;
@@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
 					     __be32 mcastgrp, int vifi)
 {
-	int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c, *proxy;
 
 	if (mcastgrp == htonl(INADDR_ANY))
 		goto skip;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == mcastgrp) {
-			if (c->mfc_un.res.ttls[vifi] < 255)
-				return c;
-
-			/* It's ok if the vifi is part of the static tree */
-			proxy = ipmr_cache_find_any_parent(mrt,
-							   c->mfc_parent);
-			if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
-				return c;
-		}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		if (c->mfc_un.res.ttls[vifi] < 255)
+			return c;
+
+		/* It's ok if the vifi is part of the static tree */
+		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+			return c;
+	}
 
 skip:
 	return ipmr_cache_find_any_parent(mrt, vifi);
 }
 
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+						__be32 origin, __be32 mcastgrp,
+						int parent)
+{
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin,
+	};
+	struct rhlist_head *tmp, *list;
+	struct mfc_cache *c;
+
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (parent == -1 || parent == c->mfc_parent)
+			return c;
+
+	return NULL;
+}
+
 /* Allocate a multicast cache entry */
 static struct mfc_cache *ipmr_cache_alloc(void)
 {
@@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
 static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 				 struct sk_buff *skb)
 {
+	const struct iphdr *iph = ip_hdr(skb);
+	struct mfc_cache *c;
 	bool found = false;
 	int err;
-	struct mfc_cache *c;
-	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 {
-	int line;
-	struct mfc_cache *c, *next;
+	struct mfc_cache *c;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (!c)
+		return -ENOENT;
+	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+	list_del_rcu(&c->list);
+	mroute_netlink_event(mrt, c, RTM_DELROUTE);
+	ipmr_cache_free(c);
 
-	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-			return 0;
-		}
-	}
-	return -ENOENT;
+	return 0;
 }
 
 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 			struct mfcctl *mfc, int mrtsock, int parent)
 {
-	bool found = false;
-	int line;
 	struct mfc_cache *uc, *c;
+	bool found;
+	int ret;
 
 	if (mfc->mfcc_parent >= MAXVIFS)
 		return -ENFILE;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
-	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			found = true;
-			break;
-		}
-	}
-
-	if (found) {
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (c) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+				  ipmr_rht_params);
+	if (ret) {
+		pr_err("ipmr: rhtable insert error %d\n", ret);
+		ipmr_cache_free(c);
+		return ret;
+	}
+	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
@@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 /* Close the multicast socket, and clear the vif tables etc */
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
-	int i;
+	struct mfc_cache *c, *tmp;
 	LIST_HEAD(list);
-	struct mfc_cache *c, *next;
+	int i;
 
 	/* Shut down all active vif entries */
 	for (i = 0; i < mrt->maxvif; i++) {
@@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	for (i = 0; i < MFC_LINES; i++) {
-		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
-			if (!all && (c->mfc_flags & MFC_STATIC))
-				continue;
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-		}
+	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+		if (!all && (c->mfc_flags & MFC_STATIC))
+			continue;
+		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+		list_del_rcu(&c->list);
+		mroute_netlink_event(mrt, c, RTM_DELROUTE);
+		ipmr_cache_free(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
 			mroute_netlink_event(mrt, c, RTM_DELROUTE);
 			ipmr_destroy_unres(mrt, c);
@@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			  struct sk_buff *skb, struct mfc_cache *cache,
 			  int local)
 {
+	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 	int psend = -1;
 	int vif, ct;
-	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 
 	vif = cache->mfc_parent;
 	cache->mfc_un.res.pkt++;
@@ -2293,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	struct mr_table *mrt;
 	struct mfc_cache *mfc;
 	unsigned int t = 0, s_t;
-	unsigned int h = 0, s_h;
 	unsigned int e = 0, s_e;
 
 	s_t = cb->args[0];
-	s_h = cb->args[1];
-	s_e = cb->args[2];
+	s_e = cb->args[1];
 
 	rcu_read_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
-		if (t > s_t)
-			s_h = 0;
-		for (h = s_h; h < MFC_LINES; h++) {
-			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
-				if (e < s_e)
-					goto next_entry;
-				if (ipmr_fill_mroute(mrt, skb,
-						     NETLINK_CB(cb->skb).portid,
-						     cb->nlh->nlmsg_seq,
-						     mfc, RTM_NEWROUTE,
-						     NLM_F_MULTI) < 0)
-					goto done;
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			if (e < s_e)
+				goto next_entry;
+			if (ipmr_fill_mroute(mrt, skb,
+					     NETLINK_CB(cb->skb).portid,
+					     cb->nlh->nlmsg_seq,
+					     mfc, RTM_NEWROUTE,
+					     NLM_F_MULTI) < 0)
+				goto done;
 next_entry:
-				e++;
-			}
-			e = s_e = 0;
+			e++;
 		}
+		e = 0;
+		s_e = 0;
+
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
@@ -2337,16 +2378,15 @@ next_entry2:
 			e++;
 		}
 		spin_unlock_bh(&mfc_unres_lock);
-		e = s_e = 0;
-		s_h = 0;
+		e = 0;
+		s_e = 0;
 next_table:
 		t++;
 	}
 done:
 	rcu_read_unlock();
 
-	cb->args[2] = e;
-	cb->args[1] = h;
+	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
@@ -2590,10 +2630,8 @@ struct ipmr_mfc_iter {
 	struct seq_net_private p;
 	struct mr_table *mrt;
 	struct list_head *cache;
-	int ct;
 };
 
-
 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
@@ -2601,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mfc_cache *mfc;
 
 	rcu_read_lock();
-	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		list_for_each_entry_rcu(mfc, it->cache, list)
-			if (pos-- == 0)
-				return mfc;
-	}
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+		if (pos-- == 0)
+			return mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
@@ -2633,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 
 	it->mrt = mrt;
 	it->cache = NULL;
-	it->ct = 0;
 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
 		: SEQ_START_TOKEN;
 }
 
 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct mfc_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt = it->mrt;
+	struct mfc_cache *mfc = v;
 
 	++*pos;
 
@@ -2656,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
-	while (++it->ct < MFC_LINES) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		if (list_empty(it->cache))
-			continue;
-		return list_first_entry(it->cache, struct mfc_cache, list);
-	}
-
 	/* exhausted cache_array, show unresolved */
 	rcu_read_unlock();
 	it->cache = &mrt->mfc_unres_queue;
-	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
@@ -2688,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc_cache_array[it->ct])
+	else if (it->cache == &mrt->mfc_cache_list)
 		rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From 717ac5ce13020920fa3975f290281395b6407900 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 12 Jan 2017 08:50:10 -0800
Subject: ipv6: sr: static percpu allocation for hmac_ring

Current allocations are not NUMA aware, and lack proper
cleanup in case of error.

It is perfectly fine to use static per cpu allocations for 256 bytes
per cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Lebrun <david.lebrun@uclouvain.be>
Acked-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_hmac.c | 43 +++----------------------------------------
 1 file changed, 3 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index ef1c8a46e7ac..6389bf3e9c9f 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -45,7 +45,7 @@
 #include <net/seg6_hmac.h>
 #include <linux/random.h>
 
-static char * __percpu *hmac_ring;
+static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
 
 static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
 {
@@ -192,7 +192,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
 	 */
 
 	local_bh_disable();
-	ring = *this_cpu_ptr(hmac_ring);
+	ring = this_cpu_ptr(hmac_ring);
 	off = ring;
 
 	/* source address */
@@ -353,27 +353,6 @@ out:
 }
 EXPORT_SYMBOL(seg6_push_hmac);
 
-static int seg6_hmac_init_ring(void)
-{
-	int i;
-
-	hmac_ring = alloc_percpu(char *);
-
-	if (!hmac_ring)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i) {
-		char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL);
-
-		if (!ring)
-			return -ENOMEM;
-
-		*per_cpu_ptr(hmac_ring, i) = ring;
-	}
-
-	return 0;
-}
-
 static int seg6_hmac_init_algo(void)
 {
 	struct seg6_hmac_algo *algo;
@@ -422,16 +401,7 @@ static int seg6_hmac_init_algo(void)
 
 int __init seg6_hmac_init(void)
 {
-	int ret;
-
-	ret = seg6_hmac_init_ring();
-	if (ret < 0)
-		goto out;
-
-	ret = seg6_hmac_init_algo();
-
-out:
-	return ret;
+	return seg6_hmac_init_algo();
 }
 EXPORT_SYMBOL(seg6_hmac_init);
 
@@ -450,13 +420,6 @@ void seg6_hmac_exit(void)
 	struct seg6_hmac_algo *algo = NULL;
 	int i, alg_count, cpu;
 
-	for_each_possible_cpu(i) {
-		char *ring = *per_cpu_ptr(hmac_ring, i);
-
-		kfree(ring);
-	}
-	free_percpu(hmac_ring);
-
 	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
 	for (i = 0; i < alg_count; i++) {
 		algo = &hmac_algos[i];
-- 
cgit v1.2.3


From 10b2eb6949ece992a1dd58edb28e01f05e5bf004 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Fri, 13 Jan 2017 09:31:32 +0100
Subject: wext: uninline stream addition functions

With 78, 111 and 85 bytes respectively (on x86-64), the
functions iwe_stream_add_event(), iwe_stream_add_point()
and iwe_stream_add_value() really shouldn't be inlines.

It appears that at least my compiler already decided
the same, and created a single instance of each one
of them for each file using it, but that's still a
number of instances in the system overall, which this
reduces.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/iw_handler.h | 67 +++++-------------------------------------------
 net/wireless/wext-core.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 60 deletions(-)

(limited to 'net')

diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h
index c2aa73e5e6bb..2509728650bd 100644
--- a/include/net/iw_handler.h
+++ b/include/net/iw_handler.h
@@ -505,25 +505,8 @@ static inline int iwe_stream_event_len_adjust(struct iw_request_info *info,
 /*
  * Wrapper to add an Wireless Event to a stream of events.
  */
-static inline char *
-iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends,
-		     struct iw_event *iwe, int event_len)
-{
-	int lcp_len = iwe_stream_lcp_len(info);
-
-	event_len = iwe_stream_event_len_adjust(info, event_len);
-
-	/* Check if it's possible */
-	if(likely((stream + event_len) < ends)) {
-		iwe->len = event_len;
-		/* Beware of alignement issues on 64 bits */
-		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
-		memcpy(stream + lcp_len, &iwe->u,
-		       event_len - lcp_len);
-		stream += event_len;
-	}
-	return stream;
-}
+char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, int event_len);
 
 static inline char *
 iwe_stream_add_event_check(struct iw_request_info *info, char *stream,
@@ -541,27 +524,8 @@ iwe_stream_add_event_check(struct iw_request_info *info, char *stream,
  * Wrapper to add an short Wireless Event containing a pointer to a
  * stream of events.
  */
-static inline char *
-iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends,
-		     struct iw_event *iwe, char *extra)
-{
-	int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
-	int point_len = iwe_stream_point_len(info);
-	int lcp_len   = iwe_stream_lcp_len(info);
-
-	/* Check if it's possible */
-	if(likely((stream + event_len) < ends)) {
-		iwe->len = event_len;
-		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
-		memcpy(stream + lcp_len,
-		       ((char *) &iwe->u) + IW_EV_POINT_OFF,
-		       IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
-		if (iwe->u.data.length && extra)
-			memcpy(stream + point_len, extra, iwe->u.data.length);
-		stream += event_len;
-	}
-	return stream;
-}
+char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, char *extra);
 
 static inline char *
 iwe_stream_add_point_check(struct iw_request_info *info, char *stream,
@@ -580,25 +544,8 @@ iwe_stream_add_point_check(struct iw_request_info *info, char *stream,
  * Be careful, this one is tricky to use properly :
  * At the first run, you need to have (value = event + IW_EV_LCP_LEN).
  */
-static inline char *
-iwe_stream_add_value(struct iw_request_info *info, char *event, char *value,
-		     char *ends, struct iw_event *iwe, int event_len)
-{
-	int lcp_len = iwe_stream_lcp_len(info);
-
-	/* Don't duplicate LCP */
-	event_len -= IW_EV_LCP_LEN;
-
-	/* Check if it's possible */
-	if(likely((value + event_len) < ends)) {
-		/* Add new value */
-		memcpy(value, &iwe->u, event_len);
-		value += event_len;
-		/* Patch LCP */
-		iwe->len = value - event;
-		memcpy(event, (char *) iwe, lcp_len);
-	}
-	return value;
-}
+char *iwe_stream_add_value(struct iw_request_info *info, char *event,
+			   char *value, char *ends, struct iw_event *iwe,
+			   int event_len);
 
 #endif	/* _IW_HANDLER_H */
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..1a4db6790e20 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
 	return ret;
 }
 #endif
+
+char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, int event_len)
+{
+	int lcp_len = iwe_stream_lcp_len(info);
+
+	event_len = iwe_stream_event_len_adjust(info, event_len);
+
+	/* Check if it's possible */
+	if (likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		/* Beware of alignement issues on 64 bits */
+		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+		memcpy(stream + lcp_len, &iwe->u,
+		       event_len - lcp_len);
+		stream += event_len;
+	}
+
+	return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_event);
+
+char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, char *extra)
+{
+	int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
+	int point_len = iwe_stream_point_len(info);
+	int lcp_len   = iwe_stream_lcp_len(info);
+
+	/* Check if it's possible */
+	if (likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+		memcpy(stream + lcp_len,
+		       ((char *) &iwe->u) + IW_EV_POINT_OFF,
+		       IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
+		if (iwe->u.data.length && extra)
+			memcpy(stream + point_len, extra, iwe->u.data.length);
+		stream += event_len;
+	}
+
+	return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_point);
+
+char *iwe_stream_add_value(struct iw_request_info *info, char *event,
+			   char *value, char *ends, struct iw_event *iwe,
+			   int event_len)
+{
+	int lcp_len = iwe_stream_lcp_len(info);
+
+	/* Don't duplicate LCP */
+	event_len -= IW_EV_LCP_LEN;
+
+	/* Check if it's possible */
+	if (likely((value + event_len) < ends)) {
+		/* Add new value */
+		memcpy(value, &iwe->u, event_len);
+		value += event_len;
+		/* Patch LCP */
+		iwe->len = value - event;
+		memcpy(event, (char *) iwe, lcp_len);
+	}
+
+	return value;
+}
+EXPORT_SYMBOL(iwe_stream_add_value);
-- 
cgit v1.2.3


From ab5bb2d51ba8da4add4612b529968446cdb504b1 Mon Sep 17 00:00:00 2001
From: vamsi krishna
Date: Fri, 13 Jan 2017 01:12:19 +0200
Subject: cfg80211: Add support for randomizing TA of Public Action frames

Add support to use a random local address (Address 2 = TA in transmit
and the same address in receive functionality) for Public Action frames
in order to improve privacy of WLAN clients. Applications fill the
random transmit address in the frame buffer in the NL80211_CMD_FRAME
command. This can be used only with the drivers that indicate support
for random local address by setting the new
NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA and/or
NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED in ext_features.

The driver needs to configure receive behavior to accept frames to the
specified random address during the time the frame exchange is pending
and such frames need to be acknowledged similarly to frames sent to the
local permanent address when this random address functionality is not
used.

Signed-off-by: vamsi krishna <vamsin@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  6 ++++++
 net/wireless/mlme.c          | 21 +++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 174f4b30e804..908886c83894 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4699,6 +4699,10 @@ enum nl80211_feature_flags {
  *	configuration (AP/mesh) with VHT rates.
  * @NL80211_EXT_FEATURE_FILS_STA: This driver supports Fast Initial Link Setup
  *	with user space SME (NL80211_CMD_AUTHENTICATE) in station mode.
+ * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA: This driver supports randomized TA
+ *	in @NL80211_CMD_FRAME while not associated.
+ * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED: This driver supports
+ *	randomized TA in @NL80211_CMD_FRAME while associated.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4714,6 +4718,8 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_BEACON_RATE_HT,
 	NL80211_EXT_FEATURE_BEACON_RATE_VHT,
 	NL80211_EXT_FEATURE_FILS_STA,
+	NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA,
+	NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 1c63a77aea34..b876f40c9dad 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -662,8 +662,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			return err;
 	}
 
-	if (!ether_addr_equal(mgmt->sa, wdev_address(wdev)))
-		return -EINVAL;
+	if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) {
+		/* Allow random TA to be used with Public Action frames if the
+		 * driver has indicated support for this. Otherwise, only allow
+		 * the local address to be used.
+		 */
+		if (!ieee80211_is_action(mgmt->frame_control) ||
+		    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
+			return -EINVAL;
+		if (!wdev->current_bss &&
+		    !wiphy_ext_feature_isset(
+			    &rdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
+			return -EINVAL;
+		if (wdev->current_bss &&
+		    !wiphy_ext_feature_isset(
+			    &rdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
+			return -EINVAL;
+	}
 
 	/* Transmit the Action frame as requested by user space */
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
-- 
cgit v1.2.3


From bf95ecdba93b98d27ac219e79f773f2074b4ca47 Mon Sep 17 00:00:00 2001
From: vamsi krishna
Date: Fri, 13 Jan 2017 01:12:20 +0200
Subject: cfg80211: Add support to sched scan to report better BSSs

Enhance sched scan to support option of finding a better BSS while in
connected state. Firmware scans the medium and reports when it finds a
known BSS which has better RSSI than the current connected BSS. New
attributes to specify the relative RSSI (compared to the current BSS)
are added to the sched scan to implement this.

Signed-off-by: vamsi krishna <vamsin@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 36 +++++++++++++++++++++++++-----------
 include/uapi/linux/nl80211.h | 30 ++++++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cb13789ebaef..4456491132cd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1619,6 +1619,17 @@ struct cfg80211_sched_scan_plan {
 	u32 iterations;
 };
 
+/**
+ * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment.
+ *
+ * @band: band of BSS which should match for RSSI level adjustment.
+ * @delta: value of RSSI level adjustment.
+ */
+struct cfg80211_bss_select_adjust {
+	enum nl80211_band band;
+	s8 delta;
+};
+
 /**
  * struct cfg80211_sched_scan_request - scheduled scan request description
  *
@@ -1654,6 +1665,16 @@ struct cfg80211_sched_scan_plan {
  *	cycle.  The driver may ignore this parameter and start
  *	immediately (or at any other time), if this feature is not
  *	supported.
+ * @relative_rssi_set: Indicates whether @relative_rssi is set or not.
+ * @relative_rssi: Relative RSSI threshold in dB to restrict scan result
+ *	reporting in connected state to cases where a matching BSS is determined
+ *	to have better or slightly worse RSSI than the current connected BSS.
+ *	The relative RSSI threshold values are ignored in disconnected state.
+ * @rssi_adjust: delta dB of RSSI preference to be given to the BSSs that belong
+ *	to the specified band while deciding whether a better BSS is reported
+ *	using @relative_rssi. If delta is a negative number, the BSSs that
+ *	belong to the specified band will be penalized by delta dB in relative
+ *	comparisions.
  */
 struct cfg80211_sched_scan_request {
 	struct cfg80211_ssid *ssids;
@@ -1673,6 +1694,10 @@ struct cfg80211_sched_scan_request {
 	u8 mac_addr[ETH_ALEN] __aligned(2);
 	u8 mac_addr_mask[ETH_ALEN] __aligned(2);
 
+	bool relative_rssi_set;
+	s8 relative_rssi;
+	struct cfg80211_bss_select_adjust rssi_adjust;
+
 	/* internal */
 	struct wiphy *wiphy;
 	struct net_device *dev;
@@ -1980,17 +2005,6 @@ struct cfg80211_ibss_params {
 	struct ieee80211_ht_cap ht_capa_mask;
 };
 
-/**
- * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment.
- *
- * @band: band of BSS which should match for RSSI level adjustment.
- * @delta: value of RSSI level adjustment.
- */
-struct cfg80211_bss_select_adjust {
-	enum nl80211_band band;
-	s8 delta;
-};
-
 /**
  * struct cfg80211_bss_selection - connection parameters for BSS selection.
  *
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 908886c83894..6b17feb5e839 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1982,6 +1982,20 @@ enum nl80211_commands {
  * @NL80211_ATTR_BSSID: The BSSID of the AP. Note that %NL80211_ATTR_MAC is also
  *	used in various commands/events for specifying the BSSID.
  *
+ * @NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI: Relative RSSI threshold by which
+ *	other BSSs has to be better or slightly worse than the current
+ *	connected BSS so that they get reported to user space.
+ *	This will give an opportunity to userspace to consider connecting to
+ *	other matching BSSs which have better or slightly worse RSSI than
+ *	the current connected BSS by using an offloaded operation to avoid
+ *	unnecessary wakeups.
+ *
+ * @NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST: When present the RSSI level for BSSs in
+ *	the specified band is to be adjusted before doing
+ *	%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI based comparision to figure out
+ *	better BSSs. The attribute value is a packed structure
+ *	value as specified by &struct nl80211_bss_select_rssi_adjust.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2388,6 +2402,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_BSSID,
 
+	NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
+	NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3080,6 +3097,13 @@ enum nl80211_reg_rule_attr {
  *	how this API was implemented in the past. Also, due to the same problem,
  *	the only way to create a matchset with only an RSSI filter (with this
  *	attribute) is if there's only a single matchset with the RSSI attribute.
+ * @NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI: Flag indicating whether
+ *	%NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to be used as absolute RSSI or
+ *	relative to current bss's RSSI.
+ * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST: When present the RSSI level for
+ *	BSS-es in the specified band is to be adjusted before doing
+ *	RSSI-based BSS selection. The attribute value is a packed structure
+ *	value as specified by &struct nl80211_bss_select_rssi_adjust.
  * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter
  *	attribute number currently defined
  * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use
@@ -3089,6 +3113,8 @@ enum nl80211_sched_scan_match_attr {
 
 	NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
 	NL80211_SCHED_SCAN_MATCH_ATTR_RSSI,
+	NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI,
+	NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST,
 
 	/* keep last */
 	__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST,
@@ -4703,6 +4729,9 @@ enum nl80211_feature_flags {
  *	in @NL80211_CMD_FRAME while not associated.
  * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED: This driver supports
  *	randomized TA in @NL80211_CMD_FRAME while associated.
+ * @NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI: The driver supports sched_scan
+ *	for reporting BSSs with better RSSI than the current connected BSS
+ *	(%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI).
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4720,6 +4749,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_FILS_STA,
 	NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA,
 	NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED,
+	NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b378d0a04003..71c66ff9a702 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -405,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
 	[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
 	[NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
+	[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
+	[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
+		.len = sizeof(struct nl80211_bss_select_rssi_adjust)
+	},
 };
 
 /* policy for the key attributes */
@@ -6950,6 +6954,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 	if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
 		return ERR_PTR(-EINVAL);
 
+	if (!wiphy_ext_feature_isset(
+		    wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
+	    (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
+	     attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
+		return ERR_PTR(-EINVAL);
+
 	request = kzalloc(sizeof(*request)
 			+ sizeof(*request->ssids) * n_ssids
 			+ sizeof(*request->match_sets) * n_match_sets
@@ -7156,6 +7166,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 		request->delay =
 			nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
 
+	if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
+		request->relative_rssi = nla_get_s8(
+			attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
+		request->relative_rssi_set = true;
+	}
+
+	if (request->relative_rssi_set &&
+	    attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
+		struct nl80211_bss_select_rssi_adjust *rssi_adjust;
+
+		rssi_adjust = nla_data(
+			attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
+		request->rssi_adjust.band = rssi_adjust->band;
+		request->rssi_adjust.delta = rssi_adjust->delta;
+		if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
+			err = -EINVAL;
+			goto out_free;
+		}
+	}
+
 	err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
 	if (err)
 		goto out_free;
@@ -9692,6 +9722,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
 	if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
 		return -ENOBUFS;
 
+	if (req->relative_rssi_set) {
+		struct nl80211_bss_select_rssi_adjust rssi_adjust;
+
+		if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
+			       req->relative_rssi))
+			return -ENOBUFS;
+
+		rssi_adjust.band = req->rssi_adjust.band;
+		rssi_adjust.delta = req->rssi_adjust.delta;
+		if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
+			    sizeof(rssi_adjust), &rssi_adjust))
+			return -ENOBUFS;
+	}
+
 	freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
 	if (!freqs)
 		return -ENOBUFS;
-- 
cgit v1.2.3


From 3093ebbeabcdddc9a982950052f2151df43c7aa2 Mon Sep 17 00:00:00 2001
From: Purushottam Kushwaha
Date: Fri, 13 Jan 2017 01:12:21 +0200
Subject: cfg80211: Specify the reason for connect timeout

This enhances the connect timeout API to also carry the reason for the
timeout. These reason codes for the connect time out are represented by
enum nl80211_timeout_reason and are passed to user space through a new
attribute NL80211_ATTR_TIMEOUT_REASON (u32).

Signed-off-by: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
[keep gfp_t argument last]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 18 ++++++++++++++----
 include/uapi/linux/nl80211.h | 21 +++++++++++++++++++++
 net/wireless/core.h          |  4 +++-
 net/wireless/mlme.c          |  3 ++-
 net/wireless/nl80211.c       |  9 +++++++--
 net/wireless/nl80211.h       |  4 +++-
 net/wireless/sme.c           | 39 +++++++++++++++++++++++++++------------
 net/wireless/util.c          |  2 +-
 8 files changed, 78 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4456491132cd..9b3427c8d1db 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5090,6 +5090,12 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
  *      %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
  *      the real status code for failures.
  * @gfp: allocation flags
+ * @timeout_reason: reason for connection timeout. This is used when the
+ *	connection fails due to a timeout instead of an explicit rejection from
+ *	the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
+ *	not known. This value is used only if @status < 0 to indicate that the
+ *	failure is due to a timeout and not due to explicit rejection by the AP.
+ *	This value is ignored in other cases (@status >= 0).
  *
  * It should be called by the underlying driver whenever connect() has
  * succeeded. This is similar to cfg80211_connect_result(), but with the
@@ -5099,7 +5105,8 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
 void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 			  struct cfg80211_bss *bss, const u8 *req_ie,
 			  size_t req_ie_len, const u8 *resp_ie,
-			  size_t resp_ie_len, int status, gfp_t gfp);
+			  size_t resp_ie_len, int status, gfp_t gfp,
+			  enum nl80211_timeout_reason timeout_reason);
 
 /**
  * cfg80211_connect_result - notify cfg80211 of connection result
@@ -5125,7 +5132,8 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			u16 status, gfp_t gfp)
 {
 	cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, resp_ie,
-			     resp_ie_len, status, gfp);
+			     resp_ie_len, status, gfp,
+			     NL80211_TIMEOUT_UNSPECIFIED);
 }
 
 /**
@@ -5136,6 +5144,7 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
  * @req_ie: association request IEs (maybe be %NULL)
  * @req_ie_len: association request IEs length
  * @gfp: allocation flags
+ * @timeout_reason: reason for connection timeout.
  *
  * It should be called by the underlying driver whenever connect() has failed
  * in a sequence where no explicit authentication/association rejection was
@@ -5145,10 +5154,11 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
  */
 static inline void
 cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
-			 const u8 *req_ie, size_t req_ie_len, gfp_t gfp)
+			 const u8 *req_ie, size_t req_ie_len, gfp_t gfp,
+			 enum nl80211_timeout_reason timeout_reason)
 {
 	cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, NULL, 0, -1,
-			     gfp);
+			     gfp, timeout_reason);
 }
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 6b17feb5e839..c51b40cc0645 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1996,6 +1996,10 @@ enum nl80211_commands {
  *	better BSSs. The attribute value is a packed structure
  *	value as specified by &struct nl80211_bss_select_rssi_adjust.
  *
+ * @NL80211_ATTR_TIMEOUT_REASON: The reason for which an operation timed out.
+ *	u32 attribute with an &enum nl80211_timeout_reason value. This is used,
+ *	e.g., with %NL80211_CMD_CONNECT event.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2405,6 +2409,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
 	NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
 
+	NL80211_ATTR_TIMEOUT_REASON,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -4788,6 +4794,21 @@ enum nl80211_connect_failed_reason {
 	NL80211_CONN_FAIL_BLOCKED_CLIENT,
 };
 
+/**
+ * enum nl80211_timeout_reason - timeout reasons
+ *
+ * @NL80211_TIMEOUT_UNSPECIFIED: Timeout reason unspecified.
+ * @NL80211_TIMEOUT_SCAN: Scan (AP discovery) timed out.
+ * @NL80211_TIMEOUT_AUTH: Authentication timed out.
+ * @NL80211_TIMEOUT_ASSOC: Association timed out.
+ */
+enum nl80211_timeout_reason {
+	NL80211_TIMEOUT_UNSPECIFIED,
+	NL80211_TIMEOUT_SCAN,
+	NL80211_TIMEOUT_AUTH,
+	NL80211_TIMEOUT_ASSOC,
+};
+
 /**
  * enum nl80211_scan_flags -  scan request control flags
  *
diff --git a/net/wireless/core.h b/net/wireless/core.h
index ba42055a036d..58ca206982fe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -228,6 +228,7 @@ struct cfg80211_event {
 			size_t resp_ie_len;
 			struct cfg80211_bss *bss;
 			int status; /* -1 = failed; 0..65535 = status code */
+			enum nl80211_timeout_reason timeout_reason;
 		} cr;
 		struct {
 			const u8 *req_ie;
@@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			       const u8 *req_ie, size_t req_ie_len,
 			       const u8 *resp_ie, size_t resp_ie_len,
 			       int status, bool wextev,
-			       struct cfg80211_bss *bss);
+			       struct cfg80211_bss *bss,
+			       enum nl80211_timeout_reason timeout_reason);
 void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 			     size_t ie_len, u16 reason, bool from_ap);
 int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index b876f40c9dad..22b3d9990065 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
 	/* update current_bss etc., consumes the bss reference */
 	__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
 				  status_code,
-				  status_code == WLAN_STATUS_SUCCESS, bss);
+				  status_code == WLAN_STATUS_SUCCESS, bss,
+				  NL80211_TIMEOUT_UNSPECIFIED);
 }
 EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 71c66ff9a702..b4e7bdd673e0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -409,6 +409,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
 		.len = sizeof(struct nl80211_bss_select_rssi_adjust)
 	},
+	[NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
 };
 
 /* policy for the key attributes */
@@ -13231,7 +13232,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 				 struct net_device *netdev, const u8 *bssid,
 				 const u8 *req_ie, size_t req_ie_len,
 				 const u8 *resp_ie, size_t resp_ie_len,
-				 int status, gfp_t gfp)
+				 int status,
+				 enum nl80211_timeout_reason timeout_reason,
+				 gfp_t gfp)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -13252,7 +13255,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	    nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
 			status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
 			status) ||
-	    (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) ||
+	    (status < 0 &&
+	     (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
+	      nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) ||
 	    (req_ie &&
 	     nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
 	    (resp_ie &&
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 75f82520211d..e488dca87423 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -56,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 				 struct net_device *netdev, const u8 *bssid,
 				 const u8 *req_ie, size_t req_ie_len,
 				 const u8 *resp_ie, size_t resp_ie_len,
-				 int status, gfp_t gfp);
+				 int status,
+				 enum nl80211_timeout_reason timeout_reason,
+				 gfp_t gfp);
 void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev, const u8 *bssid,
 			 const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 46693913fcea..b347e63d7aaa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -34,10 +34,11 @@ struct cfg80211_conn {
 		CFG80211_CONN_SCAN_AGAIN,
 		CFG80211_CONN_AUTHENTICATE_NEXT,
 		CFG80211_CONN_AUTHENTICATING,
-		CFG80211_CONN_AUTH_FAILED,
+		CFG80211_CONN_AUTH_FAILED_TIMEOUT,
 		CFG80211_CONN_ASSOCIATE_NEXT,
 		CFG80211_CONN_ASSOCIATING,
 		CFG80211_CONN_ASSOC_FAILED,
+		CFG80211_CONN_ASSOC_FAILED_TIMEOUT,
 		CFG80211_CONN_DEAUTH,
 		CFG80211_CONN_ABANDON,
 		CFG80211_CONN_CONNECTED,
@@ -140,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 	return err;
 }
 
-static int cfg80211_conn_do_work(struct wireless_dev *wdev)
+static int cfg80211_conn_do_work(struct wireless_dev *wdev,
+				 enum nl80211_timeout_reason *treason)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_connect_params *params;
@@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
 					  NULL, 0,
 					  params->key, params->key_len,
 					  params->key_idx, NULL, 0);
-	case CFG80211_CONN_AUTH_FAILED:
+	case CFG80211_CONN_AUTH_FAILED_TIMEOUT:
+		*treason = NL80211_TIMEOUT_AUTH;
 		return -ENOTCONN;
 	case CFG80211_CONN_ASSOCIATE_NEXT:
 		if (WARN_ON(!rdev->ops->assoc))
@@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
 					     WLAN_REASON_DEAUTH_LEAVING,
 					     false);
 		return err;
+	case CFG80211_CONN_ASSOC_FAILED_TIMEOUT:
+		*treason = NL80211_TIMEOUT_ASSOC;
+		/* fall through */
 	case CFG80211_CONN_ASSOC_FAILED:
 		cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
 				     NULL, 0,
@@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work)
 		container_of(work, struct cfg80211_registered_device, conn_work);
 	struct wireless_dev *wdev;
 	u8 bssid_buf[ETH_ALEN], *bssid = NULL;
+	enum nl80211_timeout_reason treason;
 
 	rtnl_lock();
 
@@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work)
 			memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
 			bssid = bssid_buf;
 		}
-		if (cfg80211_conn_do_work(wdev)) {
+		treason = NL80211_TIMEOUT_UNSPECIFIED;
+		if (cfg80211_conn_do_work(wdev, &treason)) {
 			__cfg80211_connect_result(
 					wdev->netdev, bssid,
-					NULL, 0, NULL, 0, -1, false, NULL);
+					NULL, 0, NULL, 0, -1, false, NULL,
+					treason);
 		}
 		wdev_unlock(wdev);
 	}
@@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
 	} else if (status_code != WLAN_STATUS_SUCCESS) {
 		__cfg80211_connect_result(wdev->netdev, mgmt->bssid,
 					  NULL, 0, NULL, 0,
-					  status_code, false, NULL);
+					  status_code, false, NULL,
+					  NL80211_TIMEOUT_UNSPECIFIED);
 	} else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
 		wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
 		schedule_work(&rdev->conn_work);
@@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
 	if (!wdev->conn)
 		return;
 
-	wdev->conn->state = CFG80211_CONN_AUTH_FAILED;
+	wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT;
 	schedule_work(&rdev->conn_work);
 }
 
@@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
 	if (!wdev->conn)
 		return;
 
-	wdev->conn->state = CFG80211_CONN_ASSOC_FAILED;
+	wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT;
 	schedule_work(&rdev->conn_work);
 }
 
@@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
 
 	/* we're good if we have a matching bss struct */
 	if (bss) {
-		err = cfg80211_conn_do_work(wdev);
+		enum nl80211_timeout_reason treason;
+
+		err = cfg80211_conn_do_work(wdev, &treason);
 		cfg80211_put_bss(wdev->wiphy, bss);
 	} else {
 		/* otherwise we'll need to scan for the AP first */
@@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			       const u8 *req_ie, size_t req_ie_len,
 			       const u8 *resp_ie, size_t resp_ie_len,
 			       int status, bool wextev,
-			       struct cfg80211_bss *bss)
+			       struct cfg80211_bss *bss,
+			       enum nl80211_timeout_reason timeout_reason)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const u8 *country_ie;
@@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 	nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
 				    bssid, req_ie, req_ie_len,
 				    resp_ie, resp_ie_len,
-				    status, GFP_KERNEL);
+				    status, timeout_reason, GFP_KERNEL);
 
 #ifdef CONFIG_CFG80211_WEXT
 	if (wextev) {
@@ -771,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 			  struct cfg80211_bss *bss, const u8 *req_ie,
 			  size_t req_ie_len, const u8 *resp_ie,
-			  size_t resp_ie_len, int status, gfp_t gfp)
+			  size_t resp_ie_len, int status, gfp_t gfp,
+			  enum nl80211_timeout_reason timeout_reason)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -811,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 		cfg80211_hold_bss(bss_from_pub(bss));
 	ev->cr.bss = bss;
 	ev->cr.status = status;
+	ev->cr.timeout_reason = timeout_reason;
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
 	list_add_tail(&ev->list, &wdev->event_list);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index cd8a7ae55e7d..1b9296882dcd 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -951,7 +951,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 				ev->cr.resp_ie, ev->cr.resp_ie_len,
 				ev->cr.status,
 				ev->cr.status == WLAN_STATUS_SUCCESS,
-				ev->cr.bss);
+				ev->cr.bss, ev->cr.timeout_reason);
 			break;
 		case EVENT_ROAMED:
 			__cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
-- 
cgit v1.2.3


From db8da6bb574e1692cb86624317c572b0b9306560 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:30 -0800
Subject: tcp: new helper function for RACK loss detection

Create a new helper tcp_rack_mark_skb_lost to prepare the
upcoming RACK reordering timer support.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_recovery.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..f38dba5aed7a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -3,6 +3,19 @@
 
 int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
 
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_skb_mark_lost_uncond_verify(tp, skb);
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+		/* Account for retransmits that are lost again */
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= tcp_skb_pcount(skb);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+	}
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -61,13 +74,7 @@ int tcp_rack_mark_lost(struct sock *sk)
 				continue;
 
 			/* skb is lost if packet sent later is sacked */
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-			if (scb->sacked & TCPCB_SACKED_RETRANS) {
-				scb->sacked &= ~TCPCB_SACKED_RETRANS;
-				tp->retrans_out -= tcp_skb_pcount(skb);
-				NET_INC_STATS(sock_net(sk),
-					      LINUX_MIB_TCPLOSTRETRANSMIT);
-			}
+			tcp_rack_mark_skb_lost(sk, skb);
 		} else if (!(scb->sacked & TCPCB_RETRANS)) {
 			/* Original data are sent sequentially so stop early
 			 * b/c the rest are all sent after rack_sent
-- 
cgit v1.2.3


From e636f8b0104d6622aaaed6aa5ef17dfbf165bc51 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:31 -0800
Subject: tcp: new helper for RACK to detect loss

Create a new helper tcp_rack_detect_loss to prepare the upcoming
RACK reordering timer patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  3 +--
 net/ipv4/tcp_input.c    | 12 ++++++++----
 net/ipv4/tcp_recovery.c | 22 +++++++++++++---------
 3 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1da0aa724929..51183bba3835 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,8 +1863,7 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern int tcp_rack_mark_lost(struct sock *sk);
-
+extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp,
 			     const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec6d84363024..bb24b93e64bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2865,10 +2865,14 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	}
 
 	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-	    tcp_rack_mark_lost(sk)) {
-		flag |= FLAG_LOST_RETRANS;
-		*ack_flag |= FLAG_LOST_RETRANS;
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+		u32 prior_retrans = tp->retrans_out;
+
+		tcp_rack_mark_lost(sk);
+		if (prior_retrans > tp->retrans_out) {
+			flag |= FLAG_LOST_RETRANS;
+			*ack_flag |= FLAG_LOST_RETRANS;
+		}
 	}
 
 	/* E. Process state. */
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index f38dba5aed7a..7ea0377229c0 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,17 +32,11 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	u32 reo_wnd, prior_retrans = tp->retrans_out;
-
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
-		return 0;
-
-	/* Reset the advanced flag to avoid unnecessary queue scanning */
-	tp->rack.advanced = 0;
+	u32 reo_wnd;
 
 	/* To be more reordering resilient, allow min_rtt/4 settling delay
 	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
@@ -82,7 +76,17 @@ int tcp_rack_mark_lost(struct sock *sk)
 			break;
 		}
 	}
-	return prior_retrans - tp->retrans_out;
+}
+
+void tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+		return;
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+	tcp_rack_detect_loss(sk);
 }
 
 /* Record the most recently (re)sent time among the (s)acked packets */
-- 
cgit v1.2.3


From deed7be78f512d003c6290da0a781479b31b3d74 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:32 -0800
Subject: tcp: record most recent RTT in RACK loss detection

Record the most recent RTT in RACK. It is often identical to the
"ca_rtt_us" values in tcp_clean_rtx_queue. But when the packet has
been retransmitted, RACK choses to believe the ACK is for the
(latest) retransmitted packet if the RTT is over minimum RTT.

This requires passing the arrival time of the most recent ACK to
RACK routines. The timestamp is now recorded in the "ack_time"
in tcp_sacktag_state during the ACK processing.

This patch does not change the RACK algorithm itself. It only adds
the RTT variable to prepare the next main patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  1 +
 include/net/tcp.h       |  7 ++++---
 net/ipv4/tcp_input.c    | 36 ++++++++++++++++++++++--------------
 net/ipv4/tcp_recovery.c | 41 +++++++++++++++++++++++------------------
 4 files changed, 50 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fc5848dad7a4..1255c592719c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -207,6 +207,7 @@ struct tcp_sock {
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
 		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+		u32 rtt_us;  /* Associated RTT */
 		u8 advanced; /* mstamp advanced since last lost marking */
 		u8 reord;    /* reordering detected */
 	} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51183bba3835..1439107658c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,9 +1863,10 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern void tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp,
-			     const struct skb_mstamp *xmit_time, u8 sacked);
+extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+			     const struct skb_mstamp *xmit_time,
+			     const struct skb_mstamp *ack_time);
 
 /*
  * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb24b93e64bc..8ccd171999bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1135,6 +1135,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
 	struct rate_sample *rate;
 	int	flag;
 };
@@ -1217,7 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, xmit_time, sacked);
+		tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -2813,7 +2814,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
-				  bool is_dupack, int *ack_flag, int *rexmit)
+				  bool is_dupack, int *ack_flag, int *rexmit,
+				  const struct skb_mstamp *ack_time)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2868,7 +2870,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
 		u32 prior_retrans = tp->retrans_out;
 
-		tcp_rack_mark_lost(sk);
+		tcp_rack_mark_lost(sk, ack_time);
 		if (prior_retrans > tp->retrans_out) {
 			flag |= FLAG_LOST_RETRANS;
 			*ack_flag |= FLAG_LOST_RETRANS;
@@ -3105,11 +3107,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack,
-			       struct skb_mstamp *now)
+			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct skb_mstamp first_ackt, last_ackt;
+	struct skb_mstamp *now = &sack->ack_time;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3169,7 +3171,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+				tcp_rack_advance(tp, sacked,
+						 &skb->skb_mstamp,
+						 &sack->ack_time);
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3599,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
-	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
 	sack_state.rate = &rs;
@@ -3625,7 +3628,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	skb_mstamp_get(&now);
+	skb_mstamp_get(&sack_state.ack_time);
 
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
@@ -3693,11 +3696,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state, &now);
+				    &sack_state);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	}
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
@@ -3712,15 +3716,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, &now, &rs);
-	tcp_cong_control(sk, ack, delivered, flag, &rs);
+	tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+		     sack_state.rate);
+	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3741,9 +3747,11 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
+		skb_mstamp_get(&sack_state.ack_time);
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 7ea0377229c0..557363cde58a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,7 +32,7 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-static void tcp_rack_detect_loss(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -62,13 +62,14 @@ static void tcp_rack_detect_loss(struct sock *sk)
 			continue;
 
 		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
-
-			if (skb_mstamp_us_delta(&tp->rack.mstamp,
-						&skb->skb_mstamp) <= reo_wnd)
-				continue;
-
-			/* skb is lost if packet sent later is sacked */
-			tcp_rack_mark_skb_lost(sk, skb);
+			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
+			 * A packet is lost if its elapsed time is beyond
+			 * the recent RTT plus the reordering window.
+			 */
+			if (skb_mstamp_us_delta(now, &skb->skb_mstamp) >
+			    tp->rack.rtt_us + reo_wnd) {
+				tcp_rack_mark_skb_lost(sk, skb);
+			}
 		} else if (!(scb->sacked & TCPCB_RETRANS)) {
 			/* Original data are sent sequentially so stop early
 			 * b/c the rest are all sent after rack_sent
@@ -78,7 +79,7 @@ static void tcp_rack_detect_loss(struct sock *sk)
 	}
 }
 
-void tcp_rack_mark_lost(struct sock *sk)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -86,20 +87,25 @@ void tcp_rack_mark_lost(struct sock *sk)
 		return;
 	/* Reset the advanced flag to avoid unnecessary queue scanning */
 	tp->rack.advanced = 0;
-	tcp_rack_detect_loss(sk);
+	tcp_rack_detect_loss(sk, now);
 }
 
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
-		      const struct skb_mstamp *xmit_time, u8 sacked)
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+		      const struct skb_mstamp *xmit_time,
+		      const struct skb_mstamp *ack_time)
 {
+	u32 rtt_us;
+
 	if (tp->rack.mstamp.v64 &&
 	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
 		return;
 
+	rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
 	if (sacked & TCPCB_RETRANS) {
-		struct skb_mstamp now;
-
 		/* If the sacked packet was retransmitted, it's ambiguous
 		 * whether the retransmission or the original (or the prior
 		 * retransmission) was sacked.
@@ -110,11 +116,10 @@ void tcp_rack_advance(struct tcp_sock *tp,
 		 * so it's at least one RTT (i.e., retransmission is at least
 		 * an RTT later).
 		 */
-		skb_mstamp_get(&now);
-		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+		if (rtt_us < tcp_min_rtt(tp))
 			return;
 	}
-
+	tp->rack.rtt_us = rtt_us;
 	tp->rack.mstamp = *xmit_time;
 	tp->rack.advanced = 1;
 }
-- 
cgit v1.2.3


From 57dde7f70de34d4251f291c9eac7ad920aaf56b2 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:33 -0800
Subject: tcp: add reordering timer in RACK loss detection

This patch makes RACK install a reordering timer when it suspects
some packets might be lost, but wants to delay the decision
a little bit to accomodate reordering.

It does not create a new timer but instead repurposes the existing
RTO timer, because both are meant to retransmit packets.
Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when
the RACK timing check fails. The wait time is set to

  RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge

This translates to expecting a packet (Packet) should take
(RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent.

When there are multiple packets that need a timer, we use one timer
with the maximum timeout. Therefore the timer conservatively uses
the maximum window to expire N packets by one timeout, instead of
N timeouts to expire N packets sent at different times.

The fudge factor is 2 jiffies to ensure when the timer fires, all
the suspected packets would exceed the deadline and be marked lost
by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the
clock may tick between calling icsk_reset_xmit_timer(timeout) and
actually hang the timer. The next jiffy is to lower-bound the timeout
to 2 jiffies when reo_wnd is < 1ms.

When the reordering timer fires (tcp_rack_reo_timeout): If we aren't
in Recovery we'll enter fast recovery and force fast retransmit.
This is very similar to the early retransmit (RFC5827) except RACK
is not constrained to only enter recovery for small outstanding
flights.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  4 ++-
 include/net/tcp.h                  |  4 +++
 net/ipv4/inet_diag.c               |  1 +
 net/ipv4/tcp_input.c               |  6 ++--
 net/ipv4/tcp_ipv4.c                |  1 +
 net/ipv4/tcp_output.c              |  3 +-
 net/ipv4/tcp_recovery.c            | 57 +++++++++++++++++++++++++++++++++-----
 net/ipv4/tcp_timer.c               |  3 ++
 net/ipv6/tcp_ipv6.c                |  1 +
 9 files changed, 68 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85ee3879499e..84b2edde09b1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -144,6 +144,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
 #define ICSK_TIME_EARLY_RETRANS 4	/* Early retransmit timer */
 #define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */
+#define ICSK_TIME_REO_TIMEOUT	6	/* Reordering timer */
 
 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	}
 
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
+	    what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+	    what == ICSK_TIME_REO_TIMEOUT) {
 		icsk->icsk_pending = what;
 		icsk->icsk_timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1439107658c2..64fcdeb3358b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
 					                 * for local resources.
 					                 */
+#define TCP_REO_TIMEOUT_MIN	(2000) /* Min RACK reordering timeout in usec */
 
 #define TCP_KEEPALIVE_TIME	(120*60*HZ)	/* two hours */
 #define TCP_KEEPALIVE_PROBES	9		/* Max of 9 keepalive probes	*/
@@ -397,6 +398,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 int tcp_child_process(struct sock *parent, struct sock *child,
 		      struct sk_buff *skb);
 void tcp_enter_loss(struct sock *sk);
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
@@ -541,6 +543,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
+void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
 int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
 
@@ -1867,6 +1870,7 @@ extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
 			     const struct skb_mstamp *xmit_time,
 			     const struct skb_mstamp *ack_time);
+extern void tcp_rack_reo_timeout(struct sock *sk);
 
 /*
  * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4dea33e5f295..d216e40623d3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -216,6 +216,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8ccd171999bf..be1191829963 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2522,8 +2522,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tcp_ecn_queue_cwr(tp);
 }
 
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
-			       int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2691,7 +2690,7 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mib_idx;
@@ -3031,6 +3030,7 @@ void tcp_rearm_rto(struct sock *sk)
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
 		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+		    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 56d756ecfb59..ebf3e0c4967a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2230,6 +2230,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1d5331a1b1dc..0ba9026cb70d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2960,7 +2960,8 @@ begin_fwd:
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
-		if (skb == tcp_write_queue_head(sk))
+		if (skb == tcp_write_queue_head(sk) &&
+		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
 						  TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 557363cde58a..eb39b1b6d1dc 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,19 +32,18 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+				 u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	u32 reo_wnd;
 
+	*reo_timeout = 0;
 	/* To be more reordering resilient, allow min_rtt/4 settling delay
 	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
 	 * RTT because reordering is often a path property and less related
 	 * to queuing or delayed ACKs.
-	 *
-	 * TODO: measure and adapt to the observed reordering delay, and
-	 * use a timer to retransmit like the delayed early retransmit.
 	 */
 	reo_wnd = 1000;
 	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
@@ -66,10 +65,23 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
 			 * A packet is lost if its elapsed time is beyond
 			 * the recent RTT plus the reordering window.
 			 */
-			if (skb_mstamp_us_delta(now, &skb->skb_mstamp) >
-			    tp->rack.rtt_us + reo_wnd) {
+			u32 elapsed = skb_mstamp_us_delta(now,
+							  &skb->skb_mstamp);
+			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
+
+			if (remaining < 0) {
 				tcp_rack_mark_skb_lost(sk, skb);
+				continue;
 			}
+
+			/* Skip ones marked lost but not yet retransmitted */
+			if ((scb->sacked & TCPCB_LOST) &&
+			    !(scb->sacked & TCPCB_SACKED_RETRANS))
+				continue;
+
+			/* Record maximum wait time (+1 to avoid 0) */
+			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
 		} else if (!(scb->sacked & TCPCB_RETRANS)) {
 			/* Original data are sent sequentially so stop early
 			 * b/c the rest are all sent after rack_sent
@@ -82,12 +94,19 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
 void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout;
 
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
 		return;
+
 	/* Reset the advanced flag to avoid unnecessary queue scanning */
 	tp->rack.advanced = 0;
-	tcp_rack_detect_loss(sk, now);
+	tcp_rack_detect_loss(sk, now, &timeout);
+	if (timeout) {
+		timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+					  timeout, inet_csk(sk)->icsk_rto);
+	}
 }
 
 /* Record the most recently (re)sent time among the (s)acked packets
@@ -123,3 +142,27 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
 	tp->rack.mstamp = *xmit_time;
 	tp->rack.advanced = 1;
 }
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct skb_mstamp now;
+	u32 timeout, prior_inflight;
+
+	skb_mstamp_get(&now);
+	prior_inflight = tcp_packets_in_flight(tp);
+	tcp_rack_detect_loss(sk, &now, &timeout);
+	if (prior_inflight != tcp_packets_in_flight(tp)) {
+		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+			tcp_enter_recovery(sk, false);
+			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+				tcp_cwnd_reduction(sk, 1, 0);
+		}
+		tcp_xmit_retransmit_queue(sk);
+	}
+	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+		tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 29a9bd5f1225..953c02a8566e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -563,6 +563,9 @@ void tcp_write_timer_handler(struct sock *sk)
 	event = icsk->icsk_pending;
 
 	switch (event) {
+	case ICSK_TIME_REO_TIMEOUT:
+		tcp_rack_reo_timeout(sk);
+		break;
 	case ICSK_TIME_EARLY_RETRANS:
 		tcp_resume_early_retransmit(sk);
 		break;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 228965dca3c5..f52c3742b404 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1746,6 +1746,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
-- 
cgit v1.2.3


From 1d0833df594390876647c54c2c88069d29059665 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:34 -0800
Subject: tcp: use sequence to break TS ties for RACK loss detection

The packets inside a jumbo skb (e.g., TSO) share the same skb
timestamp, even though they are sent sequentially on the wire. Since
RACK is based on time, it can not detect some packets inside the
same skb are lost.  However, we can leverage the packet sequence
numbers as extended timestamps to detect losses. Therefore, when
RACK timestamp is identical to skb's timestamp (i.e., one of the
packets of the skb is acked or sacked), we use the sequence numbers
of the acked and unacked packets to break ties.

We can use the same sequence logic to advance RACK xmit time as
well to detect more losses and avoid timeout.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  1 +
 include/net/tcp.h       |  2 +-
 net/ipv4/tcp_input.c    |  5 +++--
 net/ipv4/tcp_recovery.c | 17 ++++++++++++++---
 4 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1255c592719c..970d5f00589f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -208,6 +208,7 @@ struct tcp_sock {
 	struct tcp_rack {
 		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
 		u32 rtt_us;  /* Associated RTT */
+		u32 end_seq; /* Ending TCP sequence of the skb */
 		u8 advanced; /* mstamp advanced since last lost marking */
 		u8 reord;    /* reordering detected */
 	} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 64fcdeb3358b..5fb1e75a32a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1867,7 +1867,7 @@ extern int sysctl_tcp_recovery;
 #define TCP_RACK_LOST_RETRANS  0x1
 
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     const struct skb_mstamp *xmit_time,
 			     const struct skb_mstamp *ack_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be1191829963..e42ca11c0326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1218,7 +1218,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time);
+		tcp_rack_advance(tp, sacked, end_seq,
+				 xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -3171,7 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, sacked,
+				tcp_rack_advance(tp, sacked, scb->end_seq,
 						 &skb->skb_mstamp,
 						 &sack->ack_time);
 		}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index eb39b1b6d1dc..1e330a2f913d 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -16,6 +16,14 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+				const struct skb_mstamp *t2,
+				u32 seq1, u32 seq2)
+{
+	return skb_mstamp_after(t1, t2) ||
+	       (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -60,7 +68,8 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 		    scb->sacked & TCPCB_SACKED_ACKED)
 			continue;
 
-		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+		if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+					tp->rack.end_seq, scb->end_seq)) {
 			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
 			 * A packet is lost if its elapsed time is beyond
 			 * the recent RTT plus the reordering window.
@@ -113,14 +122,15 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
  * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
  * draft-cheng-tcpm-rack-00.txt
  */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 		      const struct skb_mstamp *xmit_time,
 		      const struct skb_mstamp *ack_time)
 {
 	u32 rtt_us;
 
 	if (tp->rack.mstamp.v64 &&
-	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+	    !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+				 end_seq, tp->rack.end_seq))
 		return;
 
 	rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
@@ -140,6 +150,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
 	}
 	tp->rack.rtt_us = rtt_us;
 	tp->rack.mstamp = *xmit_time;
+	tp->rack.end_seq = end_seq;
 	tp->rack.advanced = 1;
 }
 
-- 
cgit v1.2.3


From 98e36d449cc681f1bb2ce2230243f7f977a7da1b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:35 -0800
Subject: tcp: check undo conditions before detecting losses

Currently RACK would mark loss before the undo operations in TCP
loss recovery. This could incorrectly identify real losses as
spurious. For example a sender first experiences a delay spike and
then eventually some packets were lost due to buffer overrun.
In this case, the sender should perform fast recovery b/c not all
the packets were lost.

But the sender may first trigger a (spurious) RTO and reset
cwnd to 1. The following ACKs may used to mark real losses by
tcp_rack_mark_lost. Then in tcp_process_loss this ACK could trigger
F-RTO undo condition and unmark real losses and revert the cwnd
reduction. If there are no more ACKs coming back, eventually the
sender would timeout again instead of performing fast recovery.

The patch fixes this incorrect process by always performing
the undo checks before detecting losses.

Fixes: 4f41b1c58a32 ("tcp: use RACK to detect losses")
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e42ca11c0326..9c98dc874825 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2801,6 +2801,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 	return false;
 }
 
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+				   const struct skb_mstamp *ack_time)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+		u32 prior_retrans = tp->retrans_out;
+
+		tcp_rack_mark_lost(sk, ack_time);
+		if (prior_retrans > tp->retrans_out)
+			*ack_flag |= FLAG_LOST_RETRANS;
+	}
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2866,17 +2881,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
-	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
-		u32 prior_retrans = tp->retrans_out;
-
-		tcp_rack_mark_lost(sk, ack_time);
-		if (prior_retrans > tp->retrans_out) {
-			flag |= FLAG_LOST_RETRANS;
-			*ack_flag |= FLAG_LOST_RETRANS;
-		}
-	}
-
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
@@ -2894,11 +2898,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			tcp_try_keep_open(sk);
 			return;
 		}
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		if (icsk->icsk_ca_state != TCP_CA_Open &&
-		    !(flag & FLAG_LOST_RETRANS))
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
+		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
 		/* Change state if cwnd is undone or retransmits are lost */
 	default:
@@ -2912,6 +2918,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
-- 
cgit v1.2.3


From a0370b3f3f2cfb8b424b04c0545414abaa53f5ee Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:36 -0800
Subject: tcp: enable RACK loss detection to trigger recovery

This patch changes two things:

1. Start fast recovery with RACK in addition to other heuristics
   (e.g., DUPACK threshold, FACK). Prior to this change RACK
   is enabled to detect losses only after the recovery has
   started by other algorithms.

2. Disable TCP early retransmit. RACK subsumes the early retransmit
   with the new reordering timer feature. A latter patch in this
   series removes the early retransmit code.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       | 11 ++++-------
 net/ipv4/tcp_input.c    | 29 +++++++++++++++++++++--------
 net/ipv4/tcp_recovery.c | 16 ++++++++++------
 3 files changed, 35 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5fb1e75a32a9..423438dd6fe9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_recovery;
+#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
+
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern int sysctl_tcp_min_tso_segs;
@@ -1043,6 +1046,7 @@ static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 
 	tp->do_early_retrans = sysctl_tcp_early_retrans &&
 		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+		!(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
 		net->ipv4.sysctl_tcp_reordering == 3;
 }
 
@@ -1859,13 +1863,6 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
-
-/* Flags to enable various loss recovery features. See below */
-extern int sysctl_tcp_recovery;
-
-/* Use TCP RACK to detect (some) tail and retransmit losses */
-#define TCP_RACK_LOST_RETRANS  0x1
-
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     const struct skb_mstamp *xmit_time,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c98dc874825..4ad75b8c4fee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  *	F.e. after RTO, when all the queue is considered as lost,
  *	lost_out = packets_out and in_flight = retrans_out.
  *
- *		Essentially, we have now two algorithms counting
+ *		Essentially, we have now a few algorithms detecting
  *		lost packets.
  *
- *		FACK: It is the simplest heuristics. As soon as we decided
+ *		If the receiver supports SACK:
+ *
+ *		RFC6675/3517: It is the conventional algorithm. A packet is
+ *		considered lost if the number of higher sequence packets
+ *		SACKed is greater than or equal the DUPACK thoreshold
+ *		(reordering). This is implemented in tcp_mark_head_lost and
+ *		tcp_update_scoreboard.
+ *
+ *		RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ *		(2017-) that checks timing instead of counting DUPACKs.
+ *		Essentially a packet is considered lost if it's not S/ACKed
+ *		after RTT + reordering_window, where both metrics are
+ *		dynamically measured and adjusted. This is implemented in
+ *		tcp_rack_mark_lost.
+ *
+ *		FACK: it is the simplest heuristics. As soon as we decided
  *		that something is lost, we decide that _all_ not SACKed
  *		packets until the most forward SACK are lost. I.e.
  *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  *		takes place. We use FACK by default until reordering
  *		is suspected on the path to this destination.
  *
- *		NewReno: when Recovery is entered, we assume that one segment
+ *		If the receiver does not support SACK:
+ *
+ *		NewReno (RFC6582): in Recovery we assume that one segment
  *		is lost (classic Reno). While we are in Recovery and
  *		a partial ACK arrives, we assume that one more packet
  *		is lost (NewReno). This heuristics are the same in NewReno
  *		and SACK.
  *
- *  Imagine, that's all! Forget about all this shamanism about CWND inflation
- *  deflation etc. CWND is real congestion window, never inflated, changes
- *  only according to classic VJ rules.
- *
  * Really tricky (and requiring careful tuning) part of algorithm
  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND and,
@@ -2807,7 +2820,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk, ack_time);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 1e330a2f913d..4ecb38ae8504 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,7 +1,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
 
 static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 {
@@ -24,7 +24,9 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
 	       (t1->v64 == t2->v64 && after(seq1, seq2));
 }
 
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
  *
@@ -37,8 +39,10 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
  * is being more resilient to reordering by simply allowing some
  * "settling delay", instead of tweaking the dupthresh.
  *
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
  */
 static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 				 u32 *reo_timeout)
@@ -54,7 +58,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 	 * to queuing or delayed ACKs.
 	 */
 	reo_wnd = 1000;
-	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
 	tcp_for_write_queue(skb, sk) {
@@ -105,7 +109,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 timeout;
 
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+	if (!tp->rack.advanced)
 		return;
 
 	/* Reset the advanced flag to avoid unnecessary queue scanning */
-- 
cgit v1.2.3


From 89fe18e44f7ee5ab1c90d0dff5835acee7751427 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:37 -0800
Subject: tcp: extend F-RTO to catch more spurious timeouts

Current F-RTO reverts cwnd reset whenever a never-retransmitted
packet was (s)acked. The timeout can be declared spurious because
the packets acknoledged with this ACK was transmitted before the
timeout, so clearly not all the packets are lost to reset the cwnd.

This nice detection does not really depend F-RTO internals. This
patch applies the detection universally. On Google servers this
change detected 20% more spurious timeouts.

Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4ad75b8c4fee..9469ce384d3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1939,7 +1939,6 @@ void tcp_enter_loss(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb;
-	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 	bool mark_lost;
 
@@ -2000,13 +1999,15 @@ void tcp_enter_loss(struct sock *sk)
 	tp->high_seq = tp->snd_nxt;
 	tcp_ecn_queue_cwr(tp);
 
-	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
-	 * loss recovery is underway except recurring timeout(s) on
-	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+	/* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+	 * if a previous recovery is underway, otherwise it may incorrectly
+	 * call a timeout spurious if some previously retransmitted packets
+	 * are s/acked (sec 3.2). We do not apply that retriction since
+	 * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+	 * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+	 * on PTMU discovery to avoid sending new data.
 	 */
-	tp->frto = sysctl_tcp_frto &&
-		   (new_recovery || icsk->icsk_retransmits) &&
-		   !inet_csk(sk)->icsk_mtup.probe_size;
+	tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2740,14 +2741,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 	    tcp_try_undo_loss(sk, false))
 		return;
 
-	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-		/* Step 3.b. A timeout is spurious if not all data are
-		 * lost, i.e., never-retransmitted data are (s)acked.
-		 */
-		if ((flag & FLAG_ORIG_SACK_ACKED) &&
-		    tcp_try_undo_loss(sk, true))
-			return;
+	/* The ACK (s)acks some never-retransmitted data meaning not all
+	 * the data packets before the timeout were lost. Therefore we
+	 * undo the congestion window and state. This is essentially
+	 * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+	 * a retransmitted skb is permantly marked, we can apply such an
+	 * operation even if F-RTO was not used.
+	 */
+	if ((flag & FLAG_ORIG_SACK_ACKED) &&
+	    tcp_try_undo_loss(sk, tp->undo_marker))
+		return;
 
+	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
 		if (after(tp->snd_nxt, tp->high_seq)) {
 			if (flag & FLAG_DATA_SACKED || is_dupack)
 				tp->frto = 0; /* Step 3.a. loss was real */
-- 
cgit v1.2.3


From 840a3cbe89694fad75578856976f180e852e69aa Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:38 -0800
Subject: tcp: remove forward retransmit feature

Forward retransmit is an esoteric feature in RFC3517 (condition(3)
in the NextSeg()). Basically if a packet is not considered lost by
the current criteria (# of dupacks etc), but the congestion window
has room for more packets, then retransmit this packet.

However it actually conflicts with the rest of recovery design. For
example, when reordering is detected we want to be conservative
in retransmitting packets but forward-retransmit feature would
break that to force more retransmission. Also the implementation is
fairly complicated inside the retransmission logic inducing extra
iterations in the write queue. With RACK losses are being detected
timely and this heuristic is no longer necessary. There this patch
removes the feature.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp_input.c  |  5 -----
 net/ipv4/tcp_output.c | 61 +++------------------------------------------------
 3 files changed, 3 insertions(+), 64 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 970d5f00589f..8e5f4c15d0e5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -307,7 +307,6 @@ struct tcp_sock {
 					 */
 
 	int     lost_cnt_hint;
-	u32     retransmit_high;	/* L-bits may be on up to this seqno */
 
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	u32	high_seq;	/* snd_nxt at onset of congestion	*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9469ce384d3b..a041a92348ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 	    before(TCP_SKB_CB(skb)->seq,
 		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 		tp->retransmit_skb_hint = skb;
-
-	if (!tp->lost_out ||
-	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
-			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ba9026cb70d..6327e4d368a4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* Forward retransmissions are possible only during Recovery. */
-	if (icsk->icsk_ca_state != TCP_CA_Recovery)
-		return false;
-
-	/* No forward retransmissions in Reno are possible. */
-	if (tcp_is_reno(tp))
-		return false;
-
-	/* Yeah, we have to make difficult choice between forward transmission
-	 * and retransmission... Both ways have their merits...
-	 *
-	 * For now we do not retransmit anything, while we have some new
-	 * segments to send. In the other cases, follow rule 3 for
-	 * NextSeg() specified in RFC3517.
-	 */
-
-	if (tcp_may_send_now(sk))
-		return false;
-
-	return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	struct sk_buff *hole = NULL;
-	u32 max_segs, last_lost;
+	u32 max_segs;
 	int mib_idx;
-	int fwd_rexmitting = 0;
 
 	if (!tp->packets_out)
 		return;
 
-	if (!tp->lost_out)
-		tp->retransmit_high = tp->snd_una;
-
 	if (tp->retransmit_skb_hint) {
 		skb = tp->retransmit_skb_hint;
-		last_lost = TCP_SKB_CB(skb)->end_seq;
-		if (after(last_lost, tp->retransmit_high))
-			last_lost = tp->retransmit_high;
 	} else {
 		skb = tcp_write_queue_head(sk);
-		last_lost = tp->snd_una;
 	}
 
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 */
 		segs = min_t(int, segs, max_segs);
 
-		if (fwd_rexmitting) {
-begin_fwd:
-			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-				break;
-			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
-		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
-			tp->retransmit_high = last_lost;
-			if (!tcp_can_forward_retransmit(sk))
-				break;
-			/* Backtrack if necessary to non-L'ed skb */
-			if (hole) {
-				skb = hole;
-				hole = NULL;
-			}
-			fwd_rexmitting = 1;
-			goto begin_fwd;
-
+		if (tp->retrans_out >= tp->lost_out) {
+			break;
 		} else if (!(sacked & TCPCB_LOST)) {
 			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
 				hole = skb;
 			continue;
 
 		} else {
-			last_lost = TCP_SKB_CB(skb)->end_seq;
 			if (icsk->icsk_ca_state != TCP_CA_Loss)
 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
 			else
-- 
cgit v1.2.3


From bec41a11dd3dc8c54f766b4f494140ca92ba7c10 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:39 -0800
Subject: tcp: remove early retransmit

This patch removes the support of RFC5827 early retransmit (i.e.,
fast recovery on small inflight with <3 dupacks) because it is
subsumed by the new RACK loss detection. More specifically when
RACK receives DUPACKs, it'll arm a reordering timer to start fast
recovery after a quarter of (min)RTT, hence it covers the early
retransmit except RACK does not limit itself to specific inflight
or dupack numbers.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 19 +++--------
 include/linux/tcp.h                    |  3 +-
 include/net/tcp.h                      | 19 -----------
 net/ipv4/inet_diag.c                   |  1 -
 net/ipv4/tcp.c                         |  3 --
 net/ipv4/tcp_input.c                   | 60 ++--------------------------------
 net/ipv4/tcp_ipv4.c                    |  1 -
 net/ipv4/tcp_metrics.c                 |  1 -
 net/ipv4/tcp_minisocks.c               |  1 -
 net/ipv4/tcp_output.c                  | 11 +++----
 net/ipv4/tcp_timer.c                   |  3 --
 net/ipv6/tcp_ipv6.c                    |  1 -
 12 files changed, 12 insertions(+), 111 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7dd65c9cf707..7de2cf79e16f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
 	Allows TCP to send "duplicate" SACKs.
 
 tcp_early_retrans - INTEGER
-	Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
-	for triggering fast retransmit when the amount of outstanding data is
-	small and when no previously unsent data can be transmitted (such
-	that limited transmit could be used). Also controls the use of
-	Tail loss probe (TLP) that converts RTOs occurring due to tail
-	losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
+	Tail loss probe (TLP) converts RTOs occurring due to tail
+	losses into fast recovery (draft-ietf-tcpm-rack). Note that
+	TLP requires RACK to function properly (see tcp_recovery below)
 	Possible values:
-		0 disables ER
-		1 enables ER
-		2 enables ER but delays fast recovery and fast retransmit
-		  by a fourth of RTT. This mitigates connection falsely
-		  recovers when network has a small degree of reordering
-		  (less than 3 packets).
-		3 enables delayed ER and TLP.
-		4 enables TLP only.
+		0 disables TLP
+		3 or 4 enables TLP
 	Default: 3
 
 tcp_ecn - INTEGER
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8e5f4c15d0e5..4733368f953a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,8 +224,7 @@ struct tcp_sock {
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
-	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-		syn_data:1,	/* SYN includes data */
+	u8	syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 423438dd6fe9..c55d65f74f7f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb);
 
 /* tcp_input.c */
-void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
@@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
-/* TCP early-retransmit (ER) is similar to but more conservative than
- * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
- */
-static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
-{
-	struct net *net = sock_net((struct sock *)tp);
-
-	tp->do_early_retrans = sysctl_tcp_early_retrans &&
-		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-		!(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
-		net->ipv4.sysctl_tcp_reordering == 3;
-}
-
-static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
-{
-	tp->do_early_retrans = 0;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index d216e40623d3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -215,7 +215,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8d46c140b4a..d9023e8ed53e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
-	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
@@ -2477,8 +2476,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		else {
 			tp->thin_dupack = val;
-			if (tp->thin_dupack)
-				tcp_disable_early_retrans(tp);
 		}
 		break;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a041a92348ee..79c819077a59 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -904,8 +904,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 		tcp_disable_fack(tp);
 	}
 
-	if (metric > 0)
-		tcp_disable_early_retrans(tp);
 	tp->rack.reord = 1;
 }
 
@@ -2054,30 +2052,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned long delay;
-
-	/* Delay early retransmit and entering fast recovery for
-	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
-	 * available, or RTO is scheduled to fire first.
-	 */
-	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-	    (flag & FLAG_ECE) || !tp->srtt_us)
-		return false;
-
-	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
-		    msecs_to_jiffies(2));
-
-	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
-		return false;
-
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
-				  TCP_RTO_MAX);
-	return true;
-}
-
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
  *
@@ -2221,16 +2195,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	    tcp_is_sack(tp) && !tcp_send_head(sk))
 		return true;
 
-	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
-	 * retransmissions due to small network reorderings, we implement
-	 * Mitigation A.3 in the RFC and delay the retransmission for a short
-	 * interval if appropriate.
-	 */
-	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-	    !tcp_may_send_now(sk))
-		return !tcp_pause_early_retransmit(sk, flag);
-
 	return false;
 }
 
@@ -3050,8 +3014,7 @@ void tcp_rearm_rto(struct sock *sk)
 	} else {
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
-		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-		    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
@@ -3068,24 +3031,6 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_rearm_rto(sk);
-
-	/* Stop if ER is disabled after the delayed ER timer is scheduled */
-	if (!tp->do_early_retrans)
-		return;
-
-	tcp_enter_recovery(sk, false);
-	tcp_update_scoreboard(sk, 1);
-	tcp_xmit_retransmit_queue(sk);
-}
-
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3651,8 +3596,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	skb_mstamp_get(&sack_state.ack_time);
 
-	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
 	if (after(ack, prior_snd_una)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf3e0c4967a..63214136cf1c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2229,7 +2229,6 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	int state;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ba8f02d0f283..b9ed0d50aead 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk)
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 	if (val && tp->reordering != val) {
 		tcp_disable_fack(tp);
-		tcp_disable_early_retrans(tp);
 		tp->reordering = val;
 	}
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 06fde26a82b7..bdb443471c39 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
 		newtp->lsndtime = treq->snt_synack.stamp_jiffies;
 		newsk->sk_txhash = treq->txhash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6327e4d368a4..9a1a1494b9dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
-	}
 
 	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
 		      tcp_skb_pcount(skb));
@@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
-		return false;
 	/* No consecutive loss probes. */
 	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
 		tcp_rearm_rto(sk);
@@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
-	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+	    !tp->packets_out || !tcp_is_sack(tp) ||
+	    icsk->icsk_ca_state != TCP_CA_Open)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 953c02a8566e..40d893556e67 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -566,9 +566,6 @@ void tcp_write_timer_handler(struct sock *sk)
 	case ICSK_TIME_REO_TIMEOUT:
 		tcp_rack_reo_timeout(sk);
 		break;
-	case ICSK_TIME_EARLY_RETRANS:
-		tcp_resume_early_retransmit(sk);
-		break;
 	case ICSK_TIME_LOSS_PROBE:
 		tcp_send_loss_probe(sk);
 		break;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f52c3742b404..fc14e04028bf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1745,7 +1745,6 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
-- 
cgit v1.2.3


From ac229dca7e4e582114e1ec9765fda0915aa58468 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:40 -0800
Subject: tcp: remove RFC4653 NCR

This patch removes the (partial) implementation of the aggressive
limited transmit in RFC4653 TCP Non-Congestion Robustness (NCR).

NCR is a mitigation to the problem created by the dynamic
DUPACK threshold.  With the current adaptive DUPACK threshold
(tp->reordering) could cause timeouts by preventing fast recovery.
For example, if the last packet of a cwnd burst was reordered, the
threshold will be set to the size of cwnd. But if next application
burst is smaller than threshold and has drops instead of reorderings,
the sender would not trigger fast recovery but instead resorts to a
timeout recovery.

NCR mitigates this issue by checking the number of DUPACKs against
the current flight size additionally. The techniqueue is similar to
the early retransmit RFC.

With RACK loss detection, this mitigation is not needed, because RACK
does not use DUPACK threshold to detect losses. RACK arms a reordering
timer to fire at most a quarter RTT later to start fast recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 79c819077a59..87315ab1ab1a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2161,8 +2161,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 packets_out;
-	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
@@ -2172,19 +2170,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
-	/* Trick#4: It is still not OK... But will it be useful to delay
-	 * recovery more?
-	 */
-	packets_out = tp->packets_out;
-	if (packets_out <= tp->reordering &&
-	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
-	    !tcp_may_send_now(sk)) {
-		/* We have nothing to send. This connection is limited
-		 * either by receiver window or by application.
-		 */
-		return true;
-	}
-
 	/* If a thin stream is detected, retransmit after first
 	 * received dupack. Employ only if SACK is supported in order
 	 * to avoid possible corner-case series of spurious retransmissions
-- 
cgit v1.2.3


From 4a7f6009441144783e5925551c72e3f2e1b0839b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:41 -0800
Subject: tcp: remove thin_dupack feature

Thin stream DUPACK is to start fast recovery on only one DUPACK
provided the connection is a thin stream (i.e., low inflight).  But
this older feature is now subsumed with RACK. If a connection
receives only a single DUPACK, RACK would arm a reordering timer
and soon starts fast recovery instead of timeout if no further
ACKs are received.

The socket option (THIN_DUPACK) is kept as a nop for compatibility.
Note that this patch does not change another thin-stream feature
which enables linear RTO. Although it might be good to generalize
that in the future (i.e., linear RTO for the first say 3 retries).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 12 ------------
 include/linux/tcp.h                    |  2 +-
 net/ipv4/sysctl_net_ipv4.c             |  7 -------
 net/ipv4/tcp.c                         |  6 ++----
 net/ipv4/tcp_input.c                   | 13 -------------
 5 files changed, 3 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7de2cf79e16f..aa1bb49f1dc6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
-tcp_thin_dupack - BOOLEAN
-	Enable dynamic triggering of retransmissions after one dupACK
-	for thin streams. If set, a check is performed upon reception
-	of a dupACK to determine if the stream is thin (less than 4
-	packets in flight). As long as the stream is found to be thin,
-	data is retransmitted on the first received dupACK. This
-	improves retransmission latency for non-aggressive thin
-	streams, often found to be time-dependent.
-	For more information on thin streams, see
-	Documentation/networking/tcp-thin.txt
-	Default: 0
-
 tcp_limit_output_bytes - INTEGER
 	Controls TCP Small Queue limit per tcp socket.
 	TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4733368f953a..6c22332afb75 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,7 +220,7 @@ struct tcp_sock {
 		unused:5;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		thin_dupack : 1,/* Fast retransmit on first dupack      */
+		unused1	    : 1,
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0f2d37e8e983..c8d283615c6f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec
 	},
-	{
-		.procname       = "tcp_thin_dupack",
-		.data           = &sysctl_tcp_thin_dupack,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec
-	},
 	{
 		.procname	= "tcp_early_retrans",
 		.data		= &sysctl_tcp_early_retrans,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d9023e8ed53e..aba6ea76338e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		if (val < 0 || val > 1)
 			err = -EINVAL;
-		else {
-			tp->thin_dupack = val;
-		}
 		break;
 
 	case TCP_REPAIR:
@@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
+
 	case TCP_THIN_DUPACK:
-		val = tp->thin_dupack;
+		val = 0;
 		break;
 
 	case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87315ab1ab1a..39ebc20ca1b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
-	/* If a thin stream is detected, retransmit after first
-	 * received dupack. Employ only if SACK is supported in order
-	 * to avoid possible corner-case series of spurious retransmissions
-	 * Use only if there are no unsent data.
-	 */
-	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
-	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
-	    tcp_is_sack(tp) && !tcp_send_head(sk))
-		return true;
-
 	return false;
 }
 
-- 
cgit v1.2.3


From 94bdc9785a1136cef6a982b042719783978e8a26 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 12 Jan 2017 22:11:42 -0800
Subject: tcp: disable fack by default

This patch disables FACK by default as RACK is the successor of FACK
(inspired by the insights behind FACK).

FACK[1] in Linux works as follows: a packet P is deemed lost,
if packet Q of higher sequence is s/acked and P and Q are distant
by at least dupthresh number of packets in sequence space.

FACK is more aggressive than the IETF recommened recovery for SACK
(RFC3517 A Conservative Selective Acknowledgment (SACK)-based Loss
 Recovery Algorithm for TCP), because a single SACK may trigger
fast recovery. This obviously won't work well with reordering so
FACK is dynamically disabled upon detecting reordering.

RACK supersedes FACK by using time distance instead of sequence
distance. On reordering, RACK waits for a quarter of RTT receiving
a single SACK before starting recovery. (the timer can be made more
adaptive in the future by measuring reordering distance in time,
but currently RTT/4 seem to work well.) Once the recovery starts,
RACK behaves almost like FACK because it reduces the reodering
window to 1ms, so it fast retransmits quickly. In addition RACK
can detect loss retransmission as it does not care about the packet
sequences (being repeated or not), which is extremely useful when
the connection is going through a traffic policer.

Google server experiments indicate that disabling FACK after enabling
RACK has negligible impact on the overall loss recovery performance
with more reordering events detected.  But we still keep the FACK
implementation for backup if RACK has bugs that needs to be disabled.

[1] M. Mathis, J. Mahdavi, "Forward Acknowledgment: Refining
TCP Congestion Control," In Proceedings of SIGCOMM '96, August 1996.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 39ebc20ca1b2..1a34e9278c07 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
@@ -2114,7 +2114,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
  *		dynamically measured and adjusted. This is implemented in
  *		tcp_rack_mark_lost.
  *
- *		FACK: it is the simplest heuristics. As soon as we decided
+ *		FACK (Disabled by default. Subsumbed by RACK):
+ *		It is the simplest heuristics. As soon as we decided
  *		that something is lost, we decide that _all_ not SACKed
  *		packets until the most forward SACK are lost. I.e.
  *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
-- 
cgit v1.2.3


From 3819a35fdb8f82428c2b5190f195c4e0b8c427bf Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 13 Jan 2017 14:55:14 +0100
Subject: xfrm: fix possible null deref in xfrm_init_tempstate

Dan reports following smatch warning:
 net/xfrm/xfrm_state.c:659
 error: we previously assumed 'afinfo' could be null (see line 651)

 649  struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
 651  if (afinfo)
		...
 658  }
 659  afinfo->init_temprop(x, tmpl, daddr, saddr);

I am resonably sure afinfo cannot be NULL here.

xfrm_state4.c and state6.c are both part of ipv4/ipv6 (depends on
CONFIG_XFRM, a boolean) but even if ipv6 is a module state6.c can't
be removed (ipv6 lacks module_exit so it cannot be removed).

The only callers for xfrm6_fini that leads to state backend unregister
are error unwinding paths that can be called during ipv6 init function.

So after ipv6 module is loaded successfully the state backend cannot go
away anymore.

The family value from policy lookup path is taken from dst_entry, so
that should always be AF_INET(6).

However, since this silences the warning and avoids readers of this
code wondering about possible null deref it seems preferrable to
be defensive and just add the old check back.

Fixes: 711059b9752ad0 ("xfrm: add and use xfrm_state_afinfo_get_rcu")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a62097e640b5..5a597dbbe564 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -648,8 +648,10 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
 
-	if (afinfo)
-		afinfo->init_tempsel(&x->sel, fl);
+	if (!afinfo)
+		return;
+
+	afinfo->init_tempsel(&x->sel, fl);
 
 	if (family != tmpl->encap_family) {
 		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
-- 
cgit v1.2.3


From ebd89a2d0675f1325c2be5b7576fd8cb7e8defd0 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef
Date: Mon, 16 Jan 2017 13:17:55 +0200
Subject: IPsec: do not ignore crypto err in ah4 input

ah4 input processing uses the asynchronous hash crypto API which
supplies an error code as part of the operation completion but
the error code was being ignored.

Treat a crypto API error indication as a verification failure.

While a crypto API reported error would almost certainly result
in a memcpy of the digest failing anyway and thus the security
risk seems minor, performing a memory compare on what might be
uninitialized memory is wrong.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ah4.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
 	int ihl = ip_hdrlen(skb);
 	int ah_hlen = (ah->hdrlen + 2) << 2;
 
+	if (err)
+		goto out;
+
 	work_iph = AH_SKB_CB(skb)->tmp;
 	auth_data = ah_tmp_auth(work_iph, ihl);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
-- 
cgit v1.2.3


From 726282aa6bbe627b3876afc27f88172e37b1d01d Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef
Date: Mon, 16 Jan 2017 13:17:56 +0200
Subject: IPsec: do not ignore crypto err in ah6 input

ah6 input processing uses the asynchronous hash crypto API which
supplies an error code as part of the operation completion but
the error code was being ignored.

Treat a crypto API error indication as a verification failure.

While a crypto API reported error would almost certainly result
in a memcpy of the digest failing anyway and thus the security
risk seems minor, performing a memory compare on what might be
uninitialized memory is wrong.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ah6.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 189eb10b742d..dda6035e3b84 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	int hdr_len = skb_network_header_len(skb);
 	int ah_hlen = (ah->hdrlen + 2) << 2;
 
+	if (err)
+		goto out;
+
 	work_iph = AH_SKB_CB(skb)->tmp;
 	auth_data = ah_tmp_auth(work_iph, hdr_len);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
-- 
cgit v1.2.3


From e89df813174d81abee8be8cabc047d69ef78c26d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 13 Jan 2017 09:11:22 -0800
Subject: netlink: do not enter direct reclaim from netlink_trim()

In commit d35c99ff77ecb ("netlink: do not enter direct reclaim from
netlink_dump()") we made sure to not trigger expensive memory reclaim.

Problem is that a bit later, netlink_trim() might be called and
trigger memory reclaim.

netlink_trim() should be best effort, and really as fast as possible.
Under memory pressure, it is fine to not trim this skb.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 161b628ab2b0..edcc1e19ad53 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1210,7 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		skb = nskb;
 	}
 
-	if (!pskb_expand_head(skb, 0, -delta, allocation))
+	if (!pskb_expand_head(skb, 0, -delta,
+			      (allocation & ~__GFP_DIRECT_RECLAIM) |
+			      __GFP_NOWARN | __GFP_NORETRY))
 		skb->truesize -= delta;
 
 	return skb;
-- 
cgit v1.2.3


From 57b68ec2a7be9a7e9f8999850f8ee5baa49023e4 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Fri, 13 Jan 2017 18:48:20 +0000
Subject: flow dissector: check if arp_eth is null rather than arp

arp is being checked instead of arp_eth to see if the call to
__skb_header_pointer failed. Fix this by checking arp_eth is
null instead of arp.   Also fix to use length hlen rather than
hlen - sizeof(_arp); thanks to Eric Dumazet for spotting
this latter issue.

CoverityScan CID#1396428 ("Logically dead code") on 2nd
arp comparison (which should be arp_eth instead).

Fixes: commit 55733350e5e8b70c5 ("flow disector: ARP support")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e3dffc7a4785..c35aae13c8d2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -407,9 +407,9 @@ mpls:
 
 		arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
 					       sizeof(_arp_eth), data,
-					       hlen - sizeof(_arp),
+					       hlen,
 					       &_arp_eth);
-		if (!arp)
+		if (!arp_eth)
 			goto out_bad;
 
 		if (dissector_uses_key(flow_dissector,
-- 
cgit v1.2.3


From 11d05ac1dfb2e51e21c403f19790bf96809b2714 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Fri, 13 Jan 2017 18:27:33 -0200
Subject: sctp: remove unused var from sctp_process_asconf

Assigned but not used.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index a15d824a313d..80a9088084ac 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3210,7 +3210,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 	union sctp_params param;
 	sctp_addiphdr_t		*hdr;
 	union sctp_addr_param	*addr_param;
-	sctp_addip_param_t	*asconf_param;
 	struct sctp_chunk	*asconf_ack;
 	__be16	err_code;
 	int	length = 0;
@@ -3230,7 +3229,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 	 * asconf parameter.
 	 */
 	length = ntohs(addr_param->p.length);
-	asconf_param = (void *)addr_param + length;
 	chunk_len -= length;
 
 	/* create an ASCONF_ACK chunk.
-- 
cgit v1.2.3


From cdfb1a9f30aaf43cfcecbb6c4061cd3807a4e086 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Fri, 13 Jan 2017 18:31:15 -0200
Subject: sctp: remove useless code from sctp_apply_peer_addr_params

sctp_frag_point() doesn't store anything, and thus just calling it
cannot do anything useful.

sctp_apply_peer_addr_params is only called by
sctp_setsockopt_peer_addr_params. When operating on an asoc,
sctp_setsockopt_peer_addr_params will call sctp_apply_peer_addr_params
once for the asoc, and then once for each transport this asoc has,
meaning that the frag_point will be recomputed when updating the
transports and calling it when updating the asoc is not necessary.
IOW, no action is needed here and we can remove this call.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 318c6786d653..635e03412693 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2430,7 +2430,6 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 			sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
 		} else if (asoc) {
 			asoc->pathmtu = params->spp_pathmtu;
-			sctp_frag_point(asoc, params->spp_pathmtu);
 		} else {
 			sp->pathmtu = params->spp_pathmtu;
 		}
-- 
cgit v1.2.3


From a3308d8fd1f58c67aaae52d9468791c2082ab2c7 Mon Sep 17 00:00:00 2001
From: Paul Blakey
Date: Mon, 16 Jan 2017 10:45:13 +0200
Subject: net/sched: cls_flower: Disallow duplicate internal elements

Flower currently allows having the same filter twice with the same
priority. Actions (and statistics update) will always execute on the
first inserted rule leaving the second rule unused.
This patch disallows that.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index a3bfda3091a4..27934456d984 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -134,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
+static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+				       struct fl_flow_key *mkey)
+{
+	return rhashtable_lookup_fast(&head->ht,
+				      fl_key_get_start(mkey, &head->mask),
+				      head->ht_params);
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
@@ -181,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
-	f = rhashtable_lookup_fast(&head->ht,
-				   fl_key_get_start(&skb_mkey, &head->mask),
-				   head->ht_params);
+	f = fl_lookup(head, &skb_mkey);
 	if (f && !tc_skip_sw(f->flags)) {
 		*res = f->res;
 		return tcf_exts_exec(skb, &f->exts, res);
@@ -875,6 +881,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout;
 
 	if (!tc_skip_sw(fnew->flags)) {
+		if (!fold && fl_lookup(head, &fnew->mkey)) {
+			err = -EEXIST;
+			goto errout;
+		}
+
 		err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
 					     head->ht_params);
 		if (err)
-- 
cgit v1.2.3


From cac2661c53f35cbe651bef9b07026a5a05ab8ce0 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Tue, 17 Jan 2017 10:22:57 +0100
Subject: esp4: Avoid skb_cow_data whenever possible

This patch tries to avoid skb_cow_data on esp4.

On the encrypt side we add the IPsec tailbits
to the linear part of the buffer if there is
space on it. If there is no space on the linear
part, we add a page fragment with the tailbits to
the buffer and use separate src and dst scatterlists.

On the decrypt side, we leave the buffer as it is
if it is not cloned.

With this, we can avoid a linearization of the buffer
in most of the cases.

Joint work with:
Sowmini Varadhan <sowmini.varadhan@oracle.com>
Ilan Tayari <ilant@mellanox.com>

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h |   2 +
 net/ipv4/esp4.c    | 338 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 266 insertions(+), 74 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c52197cf51dc..d9a81dcef53e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -213,6 +213,8 @@ struct xfrm_state {
 	/* Last used time */
 	unsigned long		lastused;
 
+	struct page_frag xfrag;
+
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..9e8d97133513 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
 #include <net/protocol.h>
 #include <net/udp.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	struct esp_output_extra *extra = esp_tmp_extra(tmp);
+	struct crypto_aead *aead = x->data;
+	int extralen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		extralen += sizeof(*extra);
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
 				sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+					       struct ip_esp_hdr *esph,
+					       struct esp_output_extra *extra)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		extra->esphoff = (unsigned char *)esph -
+				 skb_transport_header(skb);
+		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+		extra->seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -130,16 +184,18 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err;
 	struct esp_output_extra *extra;
+	int err = -ENOMEM;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	int blksize;
 	int clen;
 	int alen;
@@ -149,7 +205,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int extralen;
+	int tailen;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 
@@ -169,12 +227,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
-
+	tailen = tfclen + plen + alen;
 	assoclen = sizeof(*esph);
 	extralen = 0;
 
@@ -183,35 +236,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += sizeof(__be32);
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, extralen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	extra = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	/* Fill padding... */
-	tail = skb_tail_pointer(trailer);
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
-	pskb_put(skb, trailer, clen - skb->len + alen);
-
-	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
 	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
 
 	/* this is non-NULL only with UDP Encapsulation */
 	if (x->encap) {
@@ -230,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		uh = (struct udphdr *)esph;
 		uh->source = sport;
 		uh->dest = dport;
-		uh->len = htons(skb->len - skb_transport_offset(skb));
+		uh->len = htons(skb->len + tailen
+				- skb_transport_offset(skb));
 		uh->check = 0;
 
 		switch (encap_type) {
@@ -248,31 +275,170 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		*skb_mac_header(skb) = IPPROTO_UDP;
 	}
 
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		extra->esphoff = (unsigned char *)esph -
-				 skb_transport_header(skb);
-		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
-		extra->seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			/* Fill padding... */
+			if (tfclen) {
+				memset(tail, 0, tfclen);
+				tail += tfclen;
+			}
+			do {
+				int i;
+				for (i = 0; i < plen - 2; i++)
+					tail[i] = i + 1;
+			} while (0);
+			tail[plen - 2] = plen - 2;
+			tail[plen - 1] = proto;
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			extra = esp_tmp_extra(tmp);
+			iv = esp_tmp_iv(aead, tmp, extralen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_extra(skb, esph, extra);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
+	tail = skb_tail_pointer(trailer);
+	esph = ip_esp_hdr(skb);
+
+skip_cow:
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+	pskb_put(skb, trailer, clen - skb->len + alen);
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
 
+	tmp = esp_alloc_tmp(aead, nfrags, extralen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_extra(skb, esph, extra);
+
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +464,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -401,6 +569,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -437,12 +622,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (elen <= 0)
 		goto out;
 
-	err = skb_cow_data(skb, 0, &trailer);
-	if (err < 0)
-		goto out;
-
-	nfrags = err;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -451,6 +630,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+
+	nfrags = err;
+
+skip_cow:
 	err = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
@@ -462,26 +661,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
-- 
cgit v1.2.3


From 03e2a30f6a27e2f3e5283b777f6ddd146b38c738 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Tue, 17 Jan 2017 10:23:03 +0100
Subject: esp6: Avoid skb_cow_data whenever possible

This patch tries to avoid skb_cow_data on esp6.

On the encrypt side we add the IPsec tailbits
to the linear part of the buffer if there is
space on it. If there is no space on the linear
part, we add a page fragment with the tailbits to
the buffer and use separate src and dst scatterlists.

On the decrypt side, we leave the buffer as it is
if it is not cloned.

With this, we can avoid a linearization of the buffer
in most of the cases.

Joint work with:
Sowmini Varadhan <sowmini.varadhan@oracle.com>
Ilan Tayari <ilant@mellanox.com>

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 302 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 246 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index cbcdd5db31f4..a428ac6b7c74 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
 #include <net/protocol.h>
 #include <linux/icmpv6.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	__be32 *seqhi;
+	struct crypto_aead *aead = x->data;
+	int seqhilen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		seqhilen += sizeof(__be32);
+
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
 	esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+					     struct ip_esp_hdr *esph,
+					     __be32 *seqhi)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+		*seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -152,8 +204,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	int blksize;
 	int clen;
@@ -164,10 +217,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int seqhilen;
+	int tailen;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	__be32 *seqhi;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 	aead = x->data;
@@ -186,11 +242,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
+	tailen = tfclen + plen + alen;
 
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
@@ -200,19 +252,130 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
+	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
+
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
+
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
+
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			/* Fill padding... */
+			if (tfclen) {
+				memset(tail, 0, tfclen);
+				tail += tfclen;
+			}
+			do {
+				int i;
+				for (i = 0; i < plen - 2; i++)
+					tail[i] = i + 1;
+			} while (0);
+			tail[plen - 2] = plen - 2;
+			tail[plen - 1] = proto;
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+			seqhi = esp_tmp_seqhi(tmp);
+			iv = esp_tmp_iv(aead, tmp, seqhilen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_esn(skb, esph, seqhi);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
-	seqhi = esp_tmp_seqhi(tmp);
-	iv = esp_tmp_iv(aead, tmp, seqhilen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
 
-	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	esph = ip_esp_hdr(skb);
+
+skip_cow:
+	/* Fill padding... */
 	if (tfclen) {
 		memset(tail, 0, tfclen);
 		tail += tfclen;
@@ -223,36 +386,40 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			tail[i] = i + 1;
 	} while (0);
 	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
+	tail[plen - 1] = proto;
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
 	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
-	*skb_mac_header(skb) = IPPROTO_ESP;
 
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	esph->spi = x->id.spi;
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
-
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
-		*seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
 	}
 
-	esph->spi = x->id.spi;
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_esn(skb, esph, seqhi);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +445,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -343,6 +512,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -378,14 +564,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	}
 
-	nfrags = skb_cow_data(skb, 0, &trailer);
-	if (nfrags < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = -ENOMEM;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -394,6 +572,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	nfrags = skb_cow_data(skb, 0, &trailer);
+	if (nfrags < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+skip_cow:
+	ret = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
 		goto out;
@@ -404,26 +603,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
-- 
cgit v1.2.3


From eb758c8864d49f5786432ce38fd8a72bdbbd10cf Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Tue, 17 Jan 2017 10:23:08 +0100
Subject: esp: Introduce a helper to setup the trailer

We need to setup the trailer in two different cases,
so add a helper to avoid code duplication.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 44 +++++++++++++++++++-------------------------
 net/ipv6/esp6.c | 44 +++++++++++++++++++-------------------------
 2 files changed, 38 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9e8d97133513..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -182,6 +182,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+}
+
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct esp_output_extra *extra;
@@ -304,18 +320,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 			tail = vaddr + pfrag->offset;
 
-			/* Fill padding... */
-			if (tfclen) {
-				memset(tail, 0, tfclen);
-				tail += tfclen;
-			}
-			do {
-				int i;
-				for (i = 0; i < plen - 2; i++)
-					tail[i] = i + 1;
-			} while (0);
-			tail[plen - 2] = plen - 2;
-			tail[plen - 1] = proto;
+			esp_output_fill_trailer(tail, tfclen, plen, proto);
 
 			kunmap_atomic(vaddr);
 
@@ -395,20 +400,9 @@ cow:
 	esph = ip_esp_hdr(skb);
 
 skip_cow:
-	/* Fill padding... */
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = proto;
-	pskb_put(skb, trailer, clen - skb->len + alen);
+	esp_output_fill_trailer(tail, tfclen, plen, proto);
 
+	pskb_put(skb, trailer, clen - skb->len + alen);
 	skb_push(skb, -skb_network_offset(skb));
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index a428ac6b7c74..ff54faa75631 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -198,6 +198,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+}
+
 static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
@@ -284,18 +300,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 
 			tail = vaddr + pfrag->offset;
 
-			/* Fill padding... */
-			if (tfclen) {
-				memset(tail, 0, tfclen);
-				tail += tfclen;
-			}
-			do {
-				int i;
-				for (i = 0; i < plen - 2; i++)
-					tail[i] = i + 1;
-			} while (0);
-			tail[plen - 2] = plen - 2;
-			tail[plen - 1] = proto;
+			esp_output_fill_trailer(tail, tfclen, plen, proto);
 
 			kunmap_atomic(vaddr);
 
@@ -375,20 +380,9 @@ cow:
 	esph = ip_esp_hdr(skb);
 
 skip_cow:
-	/* Fill padding... */
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = proto;
-	pskb_put(skb, trailer, clen - skb->len + alen);
+	esp_output_fill_trailer(tail, tfclen, plen, proto);
 
+	pskb_put(skb, trailer, clen - skb->len + alen);
 	skb_push(skb, -skb_network_offset(skb));
 
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
-- 
cgit v1.2.3


From aefb4d4ad83b608cb8e0cab8d3cd8e57d3f91feb Mon Sep 17 00:00:00 2001
From: Robert Shearman
Date: Mon, 16 Jan 2017 14:16:36 +0000
Subject: net: AF-specific RTM_GETSTATS attributes

Add the functionality for including address-family-specific per-link
stats in RTM_GETSTATS messages. This is done through adding a new
IFLA_STATS_AF_SPEC attribute under which address family attributes are
nested and then the AF-specific attributes can be further nested. This
follows the model of IFLA_AF_SPEC on RTM_*LINK messages and it has the
advantage of presenting an easily extended hierarchy. The rtnl_af_ops
structure is extended to provide AFs with the opportunity to fill and
provide the size of their stats attributes.

One alternative would have been to provide AFs with the ability to add
attributes directly into the RTM_GETSTATS message without a nested
hierarchy. I discounted this approach as it increases the rate at
which the 32 attribute number space is used up and it makes
implementation a little more tricky for stats dump resuming (at the
moment the order in which attributes are added to the message has to
match the numeric order of the attributes).

Another alternative would have been to register per-AF RTM_GETSTATS
handlers. I discounted this approach as I perceived a common use-case
to be getting all the stats for an interface and this approach would
necessitate multiple requests/dumps to retrieve them all.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h      |  4 ++++
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 50 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 4113916cc1bb..106de5f7bf06 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -139,6 +139,10 @@ struct rtnl_af_ops {
 						    const struct nlattr *attr);
 	int			(*set_link_af)(struct net_device *dev,
 					       const struct nlattr *attr);
+
+	int			(*fill_stats_af)(struct sk_buff *skb,
+						 const struct net_device *dev);
+	size_t			(*get_stats_af_size)(const struct net_device *dev);
 };
 
 void __rtnl_af_unregister(struct rtnl_af_ops *ops);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6b13e591abc9..184b16ed2b84 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -847,6 +847,7 @@ enum {
 	IFLA_STATS_LINK_XSTATS,
 	IFLA_STATS_LINK_XSTATS_SLAVE,
 	IFLA_STATS_LINK_OFFLOAD_XSTATS,
+	IFLA_STATS_AF_SPEC,
 	__IFLA_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 75e3ea7bda08..f538f764fca6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3829,6 +3829,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		*idxattr = 0;
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+		struct rtnl_af_ops *af_ops;
+
+		*idxattr = IFLA_STATS_AF_SPEC;
+		attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+		if (!attr)
+			goto nla_put_failure;
+
+		list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+			if (af_ops->fill_stats_af) {
+				struct nlattr *af;
+				int err;
+
+				af = nla_nest_start(skb, af_ops->family);
+				if (!af)
+					goto nla_put_failure;
+
+				err = af_ops->fill_stats_af(skb, dev);
+
+				if (err == -ENODATA)
+					nla_nest_cancel(skb, af);
+				else if (err < 0)
+					goto nla_put_failure;
+
+				nla_nest_end(skb, af);
+			}
+		}
+
+		nla_nest_end(skb, attr);
+
+		*idxattr = 0;
+	}
+
 	nlmsg_end(skb, nlh);
 
 	return 0;
@@ -3885,6 +3918,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
 		size += rtnl_get_offload_stats_size(dev);
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+		struct rtnl_af_ops *af_ops;
+
+		/* for IFLA_STATS_AF_SPEC */
+		size += nla_total_size(0);
+
+		list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+			if (af_ops->get_stats_af_size) {
+				size += nla_total_size(
+					af_ops->get_stats_af_size(dev));
+
+				/* for AF_* */
+				size += nla_total_size(0);
+			}
+		}
+	}
+
 	return size;
 }
 
-- 
cgit v1.2.3


From 27d691056bde4a6feca5e83fd92b787332c46302 Mon Sep 17 00:00:00 2001
From: Robert Shearman
Date: Mon, 16 Jan 2017 14:16:37 +0000
Subject: mpls: Packet stats

Having MPLS packet stats is useful for observing network operation and
for diagnosing network problems. In the absence of anything better,
RFC2863 and RFC3813 are used for guidance for which stats to expose
and the semantics of them. In particular rx_noroutes maps to in
unknown protos in RFC2863. The stats are exposed to userspace via
AF_MPLS attributes embedded in the IFLA_STATS_AF_SPEC attribute of
RTM_GETSTATS messages.

All the introduced fields are 64-bit, even error ones, to ensure no
overflow with long uptimes. Per-CPU counters are used to avoid
cache-line contention on the commonly used fields. The other fields
have also been made per-CPU for code to avoid performance problems in
error conditions on the assumption that on some platforms the cost of
atomic operations could be more expensive than sending the packet
(which is what would be done in the success case). If that's not the
case, we could instead not use per-CPU counters for these fields.

Only unicast and non-fragment are exposed at the moment, but other
counters can be exposed in the future either by adding to the end of
struct mpls_link_stats or by additional netlink attributes in the
AF_MPLS IFLA_STATS_AF_SPEC nested attribute.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mpls.h |  30 ++++++++
 net/mpls/af_mpls.c        | 181 ++++++++++++++++++++++++++++++++++++++++------
 net/mpls/internal.h       |  58 ++++++++++++++-
 net/mpls/mpls_iptunnel.c  |  11 ++-
 4 files changed, 252 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h
index 24a6cb1aec86..77a19dfe3990 100644
--- a/include/uapi/linux/mpls.h
+++ b/include/uapi/linux/mpls.h
@@ -43,4 +43,34 @@ struct mpls_label {
 
 #define MPLS_LABEL_FIRST_UNRESERVED	16 /* RFC3032 */
 
+/* These are embedded into IFLA_STATS_AF_SPEC:
+ * [IFLA_STATS_AF_SPEC]
+ * -> [AF_MPLS]
+ *    -> [MPLS_STATS_xxx]
+ *
+ * Attributes:
+ * [MPLS_STATS_LINK] = {
+ *     struct mpls_link_stats
+ * }
+ */
+enum {
+	MPLS_STATS_UNSPEC, /* also used as 64bit pad attribute */
+	MPLS_STATS_LINK,
+	__MPLS_STATS_MAX,
+};
+
+#define MPLS_STATS_MAX (__MPLS_STATS_MAX - 1)
+
+struct mpls_link_stats {
+	__u64	rx_packets;		/* total packets received	*/
+	__u64	tx_packets;		/* total packets transmitted	*/
+	__u64	rx_bytes;		/* total bytes received		*/
+	__u64	tx_bytes;		/* total bytes transmitted	*/
+	__u64	rx_errors;		/* bad packets received		*/
+	__u64	tx_errors;		/* packet transmit problems	*/
+	__u64	rx_dropped;		/* packet dropped on receive	*/
+	__u64	tx_dropped;		/* packet dropped on transmit	*/
+	__u64	rx_noroute;		/* no route for packet dest	*/
+};
+
 #endif /* _UAPI_MPLS_H */
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 15fe97644ffe..4dc81963af8f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -8,6 +8,7 @@
 #include <linux/ipv6.h>
 #include <linux/mpls.h>
 #include <linux/vmalloc.h>
+#include <linux/percpu.h>
 #include <net/ip.h>
 #include <net/dst.h>
 #include <net/sock.h>
@@ -17,8 +18,8 @@
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
-#include <net/addrconf.h>
 #endif
+#include <net/addrconf.h>
 #include <net/nexthop.h>
 #include "internal.h"
 
@@ -48,11 +49,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
 	return rt;
 }
 
-static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
-{
-	return rcu_dereference_rtnl(dev->mpls_ptr);
-}
-
 bool mpls_output_possible(const struct net_device *dev)
 {
 	return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,6 +94,31 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+				 const struct sk_buff *skb)
+{
+	struct mpls_dev *mdev;
+
+	if (skb->protocol == htons(ETH_P_MPLS_UC)) {
+		mdev = mpls_dev_get(dev);
+		if (mdev)
+			MPLS_INC_STATS_LEN(mdev, skb->len,
+					   tx_packets,
+					   tx_bytes);
+	} else if (skb->protocol == htons(ETH_P_IP)) {
+		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct inet6_dev *in6dev = __in6_dev_get(dev);
+
+		if (in6dev)
+			IP6_UPD_PO_STATS(dev_net(dev), in6dev,
+					 IPSTATS_MIB_OUT, skb->len);
+#endif
+	}
+}
+EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
+
 static u32 mpls_multipath_hash(struct mpls_route *rt,
 			       struct sk_buff *skb, bool bos)
 {
@@ -253,6 +274,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	struct mpls_nh *nh;
 	struct mpls_entry_decoded dec;
 	struct net_device *out_dev;
+	struct mpls_dev *out_mdev;
 	struct mpls_dev *mdev;
 	unsigned int hh_len;
 	unsigned int new_header_size;
@@ -262,17 +284,25 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	/* Careful this entire function runs inside of an rcu critical section */
 
 	mdev = mpls_dev_get(dev);
-	if (!mdev || !mdev->input_enabled)
+	if (!mdev)
 		goto drop;
 
-	if (skb->pkt_type != PACKET_HOST)
+	MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets,
+			   rx_bytes);
+
+	if (!mdev->input_enabled) {
+		MPLS_INC_STATS(mdev, rx_dropped);
 		goto drop;
+	}
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto err;
 
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
-		goto drop;
+		goto err;
 
 	if (!pskb_may_pull(skb, sizeof(*hdr)))
-		goto drop;
+		goto err;
 
 	/* Read and decode the label */
 	hdr = mpls_hdr(skb);
@@ -285,33 +315,35 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	skb_orphan(skb);
 
 	rt = mpls_route_input_rcu(net, dec.label);
-	if (!rt)
+	if (!rt) {
+		MPLS_INC_STATS(mdev, rx_noroute);
 		goto drop;
+	}
 
 	nh = mpls_select_multipath(rt, skb, dec.bos);
 	if (!nh)
-		goto drop;
-
-	/* Find the output device */
-	out_dev = rcu_dereference(nh->nh_dev);
-	if (!mpls_output_possible(out_dev))
-		goto drop;
+		goto err;
 
 	if (skb_warn_if_lro(skb))
-		goto drop;
+		goto err;
 
 	skb_forward_csum(skb);
 
 	/* Verify ttl is valid */
 	if (dec.ttl <= 1)
-		goto drop;
+		goto err;
 	dec.ttl -= 1;
 
+	/* Find the output device */
+	out_dev = rcu_dereference(nh->nh_dev);
+	if (!mpls_output_possible(out_dev))
+		goto tx_err;
+
 	/* Verify the destination can hold the packet */
 	new_header_size = mpls_nh_header_size(nh);
 	mtu = mpls_dev_mtu(out_dev);
 	if (mpls_pkt_too_big(skb, mtu - new_header_size))
-		goto drop;
+		goto tx_err;
 
 	hh_len = LL_RESERVED_SPACE(out_dev);
 	if (!out_dev->header_ops)
@@ -319,7 +351,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 
 	/* Ensure there is enough space for the headers in the skb */
 	if (skb_cow(skb, hh_len + new_header_size))
-		goto drop;
+		goto tx_err;
 
 	skb->dev = out_dev;
 	skb->protocol = htons(ETH_P_MPLS_UC);
@@ -327,7 +359,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	if (unlikely(!new_header_size && dec.bos)) {
 		/* Penultimate hop popping */
 		if (!mpls_egress(rt, skb, dec))
-			goto drop;
+			goto err;
 	} else {
 		bool bos;
 		int i;
@@ -343,6 +375,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	mpls_stats_inc_outucastpkts(out_dev, skb);
+
 	/* If via wasn't specified then send out using device address */
 	if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
 		err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
@@ -355,6 +389,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 				    __func__, err);
 	return 0;
 
+tx_err:
+	out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+	if (out_mdev)
+		MPLS_INC_STATS(out_mdev, tx_errors);
+	goto drop;
+err:
+	MPLS_INC_STATS(mdev, rx_errors);
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
@@ -853,6 +894,70 @@ errout:
 	return err;
 }
 
+static void mpls_get_stats(struct mpls_dev *mdev,
+			   struct mpls_link_stats *stats)
+{
+	struct mpls_pcpu_stats *p;
+	int i;
+
+	memset(stats, 0, sizeof(*stats));
+
+	for_each_possible_cpu(i) {
+		struct mpls_link_stats local;
+		unsigned int start;
+
+		p = per_cpu_ptr(mdev->stats, i);
+		do {
+			start = u64_stats_fetch_begin(&p->syncp);
+			local = p->stats;
+		} while (u64_stats_fetch_retry(&p->syncp, start));
+
+		stats->rx_packets	+= local.rx_packets;
+		stats->rx_bytes		+= local.rx_bytes;
+		stats->tx_packets	+= local.tx_packets;
+		stats->tx_bytes		+= local.tx_bytes;
+		stats->rx_errors	+= local.rx_errors;
+		stats->tx_errors	+= local.tx_errors;
+		stats->rx_dropped	+= local.rx_dropped;
+		stats->tx_dropped	+= local.tx_dropped;
+		stats->rx_noroute	+= local.rx_noroute;
+	}
+}
+
+static int mpls_fill_stats_af(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct mpls_link_stats *stats;
+	struct mpls_dev *mdev;
+	struct nlattr *nla;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return -ENODATA;
+
+	nla = nla_reserve_64bit(skb, MPLS_STATS_LINK,
+				sizeof(struct mpls_link_stats),
+				MPLS_STATS_UNSPEC);
+	if (!nla)
+		return -EMSGSIZE;
+
+	stats = nla_data(nla);
+	mpls_get_stats(mdev, stats);
+
+	return 0;
+}
+
+static size_t mpls_get_stats_af_size(const struct net_device *dev)
+{
+	struct mpls_dev *mdev;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return 0;
+
+	return nla_total_size_64bit(sizeof(struct mpls_link_stats));
+}
+
 #define MPLS_PERDEV_SYSCTL_OFFSET(field)	\
 	(&((struct mpls_dev *)0)->field)
 
@@ -911,6 +1016,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 {
 	struct mpls_dev *mdev;
 	int err = -ENOMEM;
+	int i;
 
 	ASSERT_RTNL();
 
@@ -918,6 +1024,17 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 	if (!mdev)
 		return ERR_PTR(err);
 
+	mdev->stats = alloc_percpu(struct mpls_pcpu_stats);
+	if (!mdev->stats)
+		goto free;
+
+	for_each_possible_cpu(i) {
+		struct mpls_pcpu_stats *mpls_stats;
+
+		mpls_stats = per_cpu_ptr(mdev->stats, i);
+		u64_stats_init(&mpls_stats->syncp);
+	}
+
 	err = mpls_dev_sysctl_register(dev, mdev);
 	if (err)
 		goto free;
@@ -927,10 +1044,19 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 	return mdev;
 
 free:
+	free_percpu(mdev->stats);
 	kfree(mdev);
 	return ERR_PTR(err);
 }
 
+static void mpls_dev_destroy_rcu(struct rcu_head *head)
+{
+	struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu);
+
+	free_percpu(mdev->stats);
+	kfree(mdev);
+}
+
 static void mpls_ifdown(struct net_device *dev, int event)
 {
 	struct mpls_route __rcu **platform_label;
@@ -1045,7 +1171,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 		if (mdev) {
 			mpls_dev_sysctl_unregister(mdev);
 			RCU_INIT_POINTER(dev->mpls_ptr, NULL);
-			kfree_rcu(mdev, rcu);
+			call_rcu(&mdev->rcu, mpls_dev_destroy_rcu);
 		}
 		break;
 	case NETDEV_CHANGENAME:
@@ -1706,6 +1832,12 @@ static struct pernet_operations mpls_net_ops = {
 	.exit = mpls_net_exit,
 };
 
+static struct rtnl_af_ops mpls_af_ops __read_mostly = {
+	.family		   = AF_MPLS,
+	.fill_stats_af	   = mpls_fill_stats_af,
+	.get_stats_af_size = mpls_get_stats_af_size,
+};
+
 static int __init mpls_init(void)
 {
 	int err;
@@ -1722,6 +1854,8 @@ static int __init mpls_init(void)
 
 	dev_add_pack(&mpls_packet_type);
 
+	rtnl_af_register(&mpls_af_ops);
+
 	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
@@ -1738,6 +1872,7 @@ module_init(mpls_init);
 static void __exit mpls_exit(void)
 {
 	rtnl_unregister_all(PF_MPLS);
+	rtnl_af_unregister(&mpls_af_ops);
 	dev_remove_pack(&mpls_packet_type);
 	unregister_netdevice_notifier(&mpls_dev_notifier);
 	unregister_pernet_subsys(&mpls_net_ops);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bdfef6c3271a..d97243034605 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -9,13 +9,58 @@ struct mpls_entry_decoded {
 	u8 bos;
 };
 
+struct mpls_pcpu_stats {
+	struct mpls_link_stats	stats;
+	struct u64_stats_sync	syncp;
+};
+
 struct mpls_dev {
-	int			input_enabled;
+	int				input_enabled;
 
-	struct ctl_table_header *sysctl;
-	struct rcu_head		rcu;
+	struct mpls_pcpu_stats __percpu	*stats;
+
+	struct ctl_table_header		*sysctl;
+	struct rcu_head			rcu;
 };
 
+#if BITS_PER_LONG == 32
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field)		\
+	do {								\
+		__typeof__(*(mdev)->stats) *ptr =			\
+			raw_cpu_ptr((mdev)->stats);			\
+		local_bh_disable();					\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->stats.pkts_field++;				\
+		ptr->stats.bytes_field += (len);			\
+		u64_stats_update_end(&ptr->syncp);			\
+		local_bh_enable();					\
+	} while (0)
+
+#define MPLS_INC_STATS(mdev, field)					\
+	do {								\
+		__typeof__(*(mdev)->stats) *ptr =			\
+			raw_cpu_ptr((mdev)->stats);			\
+		local_bh_disable();					\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->stats.field++;					\
+		u64_stats_update_end(&ptr->syncp);			\
+		local_bh_enable();					\
+	} while (0)
+
+#else
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field)		\
+	do {								\
+		this_cpu_inc((mdev)->stats->stats.pkts_field);		\
+		this_cpu_add((mdev)->stats->stats.bytes_field, (len));	\
+	} while (0)
+
+#define MPLS_INC_STATS(mdev, field)			\
+	this_cpu_inc((mdev)->stats->stats.field)
+
+#endif
+
 struct sk_buff;
 
 #define LABEL_NOT_SPECIFIED (1 << 20)
@@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
 	return result;
 }
 
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+	return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
 		   const u32 label[]);
 int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
@@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
 bool mpls_output_possible(const struct net_device *dev);
 unsigned int mpls_dev_mtu(const struct net_device *dev);
 bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+				 const struct sk_buff *skb);
 
 #endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 2f7ccd934416..02531284bc49 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct rtable *rt = NULL;
 	struct rt6_info *rt6 = NULL;
+	struct mpls_dev *out_mdev;
 	int err = 0;
 	bool bos;
 	int i;
 	unsigned int ttl;
 
+	/* Find the output device */
+	out_dev = dst->dev;
+
 	/* Obtain the ttl */
 	if (dst->ops->family == AF_INET) {
 		ttl = ip_hdr(skb)->ttl;
@@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb)
 
 	skb_orphan(skb);
 
-	/* Find the output device */
-	out_dev = dst->dev;
 	if (!mpls_output_possible(out_dev) ||
 	    !dst->lwtstate || skb_warn_if_lro(skb))
 		goto drop;
@@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb)
 		bos = false;
 	}
 
+	mpls_stats_inc_outucastpkts(out_dev, skb);
+
 	if (rt)
 		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
 				 skb);
@@ -122,6 +126,9 @@ static int mpls_xmit(struct sk_buff *skb)
 	return LWTUNNEL_XMIT_DONE;
 
 drop:
+	out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+	if (out_mdev)
+		MPLS_INC_STATS(out_mdev, tx_errors);
 	kfree_skb(skb);
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From 53631a5f9c6669264adb7b4e92fd95d1d6ffa7d3 Mon Sep 17 00:00:00 2001
From: Lance Richardson
Date: Mon, 16 Jan 2017 18:11:35 -0500
Subject: bridge: sparse fixes in br_ip6_multicast_alloc_query()

Changed type of csum field in struct igmpv3_query from __be16 to
__sum16 to eliminate type warning, made same change in struct
igmpv3_report for consistency.

Fixed up an ntohs() where htons() should have been used instead.

Signed-off-by: Lance Richardson <lrichard@redhat.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/igmp.h | 4 ++--
 net/bridge/br_multicast.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h
index ccbb32aa6704..a97f9a7568cf 100644
--- a/include/uapi/linux/igmp.h
+++ b/include/uapi/linux/igmp.h
@@ -53,7 +53,7 @@ struct igmpv3_grec {
 struct igmpv3_report {
 	__u8 type;
 	__u8 resv1;
-	__be16 csum;
+	__sum16 csum;
 	__be16 resv2;
 	__be16 ngrec;
 	struct igmpv3_grec grec[0];
@@ -62,7 +62,7 @@ struct igmpv3_report {
 struct igmpv3_query {
 	__u8 type;
 	__u8 code;
-	__be16 csum;
+	__sum16 csum;
 	__be32 group;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u8 qrv:3,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b30e77e8427c..f66346122dc4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -540,7 +540,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
 		break;
 	case 2:
 		mld2q = (struct mld2_query *)icmp6_hdr(skb);
-		mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval));
+		mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
 		mld2q->mld2q_type = ICMPV6_MGM_QUERY;
 		mld2q->mld2q_code = 0;
 		mld2q->mld2q_cksum = 0;
-- 
cgit v1.2.3


From a7ef6715c4b5905b663dddf3832dbdf77b3bf8ac Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Tue, 17 Jan 2017 10:21:19 +0800
Subject: net: ping: Use right format specifier to avoid type casting

The inet_num is u16, so use %hu instead of casting it to int. And
the sk_bound_dev_if is int actually, so it needn't cast to int.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 86cca610f4c2..592db6a3a0e9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -433,9 +433,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	}
 
-	pr_debug("after bind(): num = %d, dif = %d\n",
-		 (int)isk->inet_num,
-		 (int)sk->sk_bound_dev_if);
+	pr_debug("after bind(): num = %hu, dif = %d\n",
+		 isk->inet_num,
+		 sk->sk_bound_dev_if);
 
 	err = 0;
 	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
-- 
cgit v1.2.3


From 0e40f4c9593ba2c7c30150ed669da97bd581c0cd Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Tue, 17 Jan 2017 13:37:19 -0500
Subject: tcp: accept RST for rcv_nxt - 1 after receiving a FIN

Using a Mac OSX box as a client connecting to a Linux server, we have found
that when certain applications (such as 'ab'), are abruptly terminated
(via ^C), a FIN is sent followed by a RST packet on tcp connections. The
FIN is accepted by the Linux stack but the RST is sent with the same
sequence number as the FIN, and Linux responds with a challenge ACK per
RFC 5961. The OSX client then sometimes (they are rate-limited) does not
reply with any RST as would be expected on a closed socket.

This results in sockets accumulating on the Linux server left mostly in
the CLOSE_WAIT state, although LAST_ACK and CLOSING are also possible.
This sequence of events can tie up a lot of resources on the Linux server
since there may be a lot of data in write buffers at the time of the RST.
Accepting a RST equal to rcv_nxt - 1, after we have already successfully
processed a FIN, has made a significant difference for us in practice, by
freeing up unneeded resources in a more expedient fashion.

A packetdrill test demonstrating the behavior:

// testing mac osx rst behavior

// Establish a connection
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < S 0:0(0) win 32768 <mss 1460,nop,wscale 10>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 5>
0.200 < . 1:1(0) ack 1 win 32768
0.200 accept(3, ..., ...) = 4

// Client closes the connection
0.300 < F. 1:1(0) ack 1 win 32768

// now send rst with same sequence
0.300 < R. 1:1(0) ack 1 win 32768

// make sure we are in TCP_CLOSE
0.400 %{
assert tcpi_state == 7
}%

Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a34e9278c07..bfa165cc455a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5199,6 +5199,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	return err;
 }
 
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+					       TCPF_CLOSING));
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5237,20 +5254,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 						  LINUX_MIB_TCPACKSKIPPEDSEQ,
 						  &tp->last_oow_ack_time))
 				tcp_send_dupack(sk, skb);
+		} else if (tcp_reset_check(sk, skb)) {
+			tcp_reset(sk);
 		}
 		goto discard;
 	}
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		/* RFC 5961 3.2 (extend to match against SACK too if available):
-		 * If seq num matches RCV.NXT or the right-most SACK block,
+		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+		 * FIN and SACK too if available):
+		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+		 * the right-most SACK block,
 		 * then
 		 *     RESET the connection
 		 * else
 		 *     Send a challenge ACK
 		 */
-		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+		    tcp_reset_check(sk, skb)) {
 			rst_seq_match = true;
 		} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
 			struct tcp_sack_block *sp = &tp->selective_acks[0];
-- 
cgit v1.2.3


From fe38d2a1c8bee0b3a0be40de5b621a28200612e5 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:01 -0800
Subject: inet: collapse ipv4/v6 rcv_saddr_equal functions into one

We pass these per-protocol equal functions around in various places, but
we can just have one function that checks the sk->sk_family and then do
the right comparison function.  I've also changed the ipv4 version to
not cast to inet_sock since it is unneeded.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h           |  4 +--
 include/net/inet_hashtables.h    |  5 +--
 include/net/udp.h                |  1 -
 net/ipv4/inet_connection_sock.c  | 72 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/inet_hashtables.c       | 16 +++------
 net/ipv4/udp.c                   | 58 +++++++-------------------------
 net/ipv6/inet6_connection_sock.c |  4 +--
 net/ipv6/inet6_hashtables.c      | 46 +------------------------
 net/ipv6/udp.c                   |  2 +-
 9 files changed, 95 insertions(+), 113 deletions(-)

(limited to 'net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8f998afc1384..17c6fd84e287 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -88,9 +88,7 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
 		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
-int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard);
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			 bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 0574493e3899..756ed1692906 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -203,10 +203,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
-int __inet_hash(struct sock *sk, struct sock *osk,
-		int (*saddr_same)(const struct sock *sk1,
-				  const struct sock *sk2,
-				  bool match_wildcard));
+int __inet_hash(struct sock *sk, struct sock *osk);
 int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 1661791e8ca1..c9d8b8e848e0 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -204,7 +204,6 @@ static inline void udp_lib_close(struct sock *sk, long timeout)
 }
 
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
-		     int (*)(const struct sock *, const struct sock *, bool),
 		     unsigned int hash2_nulladdr);
 
 u32 udp_flow_hashrnd(void);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19ea045c50ed..ba597cb504ff 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,78 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *                          only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *                          and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+				bool match_wildcard)
+{
+	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+	int sk2_ipv6only = inet_v6_ipv6only(sk2);
+	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+		if (!sk2_ipv6only) {
+			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+				return 1;
+			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+				return match_wildcard;
+		}
+		return 0;
+	}
+
+	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+		return 1;
+
+	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (sk2_rcv_saddr6 &&
+	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+		return 1;
+
+	return 0;
+}
+#endif
+
+/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+				bool match_wildcard)
+{
+	if (!ipv6_only_sock(sk2)) {
+		if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+			return 1;
+		if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+			return match_wildcard;
+	}
+	return 0;
+}
+
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(sk, sk2, match_wildcard);
+#endif
+	return ipv4_rcv_saddr_equal(sk, sk2, match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
 void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
 	unsigned int seq;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..2ef9b010bd34 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -435,10 +435,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
 
 static int inet_reuseport_add_sock(struct sock *sk,
-				   struct inet_listen_hashbucket *ilb,
-				   int (*saddr_same)(const struct sock *sk1,
-						     const struct sock *sk2,
-						     bool match_wildcard))
+				   struct inet_listen_hashbucket *ilb)
 {
 	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
 	struct sock *sk2;
@@ -451,7 +448,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
 		    inet_csk(sk2)->icsk_bind_hash == tb &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
-		    saddr_same(sk, sk2, false))
+		    inet_rcv_saddr_equal(sk, sk2, false))
 			return reuseport_add_sock(sk, sk2);
 	}
 
@@ -461,10 +458,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 	return 0;
 }
 
-int __inet_hash(struct sock *sk, struct sock *osk,
-		 int (*saddr_same)(const struct sock *sk1,
-				   const struct sock *sk2,
-				   bool match_wildcard))
+int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
@@ -479,7 +473,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
 
 	spin_lock(&ilb->lock);
 	if (sk->sk_reuseport) {
-		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+		err = inet_reuseport_add_sock(sk, ilb);
 		if (err)
 			goto unlock;
 	}
@@ -503,7 +497,7 @@ int inet_hash(struct sock *sk)
 
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
+		err = __inet_hash(sk, NULL);
 		local_bh_enable();
 	}
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4318d72e0248..d6dddcf59e79 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -137,11 +137,7 @@ EXPORT_SYMBOL(udp_memory_allocated);
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
-			       struct sock *sk,
-			       int (*saddr_comp)(const struct sock *sk1,
-						 const struct sock *sk2,
-						 bool match_wildcard),
-			       unsigned int log)
+			       struct sock *sk, unsigned int log)
 {
 	struct sock *sk2;
 	kuid_t uid = sock_i_uid(sk);
@@ -153,7 +149,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    saddr_comp(sk, sk2, true)) {
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
 			if (sk2->sk_reuseport && sk->sk_reuseport &&
 			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
 			    uid_eq(uid, sock_i_uid(sk2))) {
@@ -176,10 +172,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
  */
 static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 				struct udp_hslot *hslot2,
-				struct sock *sk,
-				int (*saddr_comp)(const struct sock *sk1,
-						  const struct sock *sk2,
-						  bool match_wildcard))
+				struct sock *sk)
 {
 	struct sock *sk2;
 	kuid_t uid = sock_i_uid(sk);
@@ -193,7 +186,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    saddr_comp(sk, sk2, true)) {
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
 			if (sk2->sk_reuseport && sk->sk_reuseport &&
 			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
 			    uid_eq(uid, sock_i_uid(sk2))) {
@@ -208,10 +201,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 	return res;
 }
 
-static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
-				  int (*saddr_same)(const struct sock *sk1,
-						    const struct sock *sk2,
-						    bool match_wildcard))
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 {
 	struct net *net = sock_net(sk);
 	kuid_t uid = sock_i_uid(sk);
@@ -225,7 +215,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
 		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
 		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
-		    (*saddr_same)(sk, sk2, false)) {
+		    inet_rcv_saddr_equal(sk, sk2, false)) {
 			return reuseport_add_sock(sk, sk2);
 		}
 	}
@@ -241,14 +231,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
  *
  *  @sk:          socket struct in question
  *  @snum:        port number to look up
- *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
  *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
  *                   with NULL address
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
-		     int (*saddr_comp)(const struct sock *sk1,
-				       const struct sock *sk2,
-				       bool match_wildcard),
 		     unsigned int hash2_nulladdr)
 {
 	struct udp_hslot *hslot, *hslot2;
@@ -277,7 +263,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
 			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-					    saddr_comp, udptable->log);
+					    udptable->log);
 
 			snum = first;
 			/*
@@ -310,12 +296,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			if (hslot->count < hslot2->count)
 				goto scan_primary_hash;
 
-			exist = udp_lib_lport_inuse2(net, snum, hslot2,
-						     sk, saddr_comp);
+			exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
 			if (!exist && (hash2_nulladdr != slot2)) {
 				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
 				exist = udp_lib_lport_inuse2(net, snum, hslot2,
-							     sk, saddr_comp);
+							     sk);
 			}
 			if (exist)
 				goto fail_unlock;
@@ -323,8 +308,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 				goto found;
 		}
 scan_primary_hash:
-		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
-					saddr_comp, 0))
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
 			goto fail_unlock;
 	}
 found:
@@ -333,7 +317,7 @@ found:
 	udp_sk(sk)->udp_portaddr_hash ^= snum;
 	if (sk_unhashed(sk)) {
 		if (sk->sk_reuseport &&
-		    udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+		    udp_reuseport_add_sock(sk, hslot)) {
 			inet_sk(sk)->inet_num = 0;
 			udp_sk(sk)->udp_port_hash = 0;
 			udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -365,24 +349,6 @@ fail:
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          0.0.0.0 only equals to 0.0.0.0
- */
-int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
-
-	if (!ipv6_only_sock(sk2)) {
-		if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
-			return 1;
-		if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
-			return match_wildcard;
-	}
-	return 0;
-}
-
 static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
 			      unsigned int port)
 {
@@ -398,7 +364,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
-	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+	return udp_lib_get_port(sk, snum, hash2_nulladdr);
 }
 
 static int compute_score(struct sock *sk, struct net *net,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 7396e75e161b..55ee2ea2aee0 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -54,12 +54,12 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			      !uid_eq(uid,
 				      sock_i_uid((struct sock *)sk2))))) {
-				if (ipv6_rcv_saddr_equal(sk, sk2, true))
+				if (inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 			if (!relax && reuse && sk2->sk_reuse &&
 			    sk2->sk_state != TCP_LISTEN &&
-			    ipv6_rcv_saddr_equal(sk, sk2, true))
+			    inet_rcv_saddr_equal(sk, sk2, true))
 				break;
 		}
 	}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 02761c9fe43e..d0900918a19e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk)
 
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+		err = __inet_hash(sk, NULL);
 		local_bh_enable();
 	}
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(inet6_hash);
-
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
- */
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
-	/* if both are mapped, treat as IPv4 */
-	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
-		if (!sk2_ipv6only) {
-			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
-				return 1;
-			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
-				return match_wildcard;
-		}
-		return 0;
-	}
-
-	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
-		return 1;
-
-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
-	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
-		return 1;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4d5c4eee4b3f..05d69324862e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -103,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
-	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
+	return udp_lib_get_port(sk, snum, hash2_nulladdr);
 }
 
 static void udp_v6_rehash(struct sock *sk)
-- 
cgit v1.2.3


From aa078842b702b4a45111f028a604a6c8f69cb27d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:02 -0800
Subject: inet: drop ->bind_conflict

The only difference between inet6_csk_bind_conflict and inet_csk_bind_conflict
is how they check the rcv_saddr, so delete this call back and simply
change inet_csk_bind_conflict to call inet_rcv_saddr_equal.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_connection_sock.h |  5 -----
 include/net/inet_connection_sock.h  |  6 ------
 net/dccp/ipv4.c                     |  1 -
 net/dccp/ipv6.c                     |  2 --
 net/ipv4/inet_connection_sock.c     | 22 +++++++-------------
 net/ipv4/tcp_ipv4.c                 |  1 -
 net/ipv6/inet6_connection_sock.c    | 40 -------------------------------------
 net/ipv6/tcp_ipv6.c                 |  2 --
 8 files changed, 7 insertions(+), 72 deletions(-)

(limited to 'net')

diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index 3212b39b5bfc..8ec87b62257b 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,16 +15,11 @@
 
 #include <linux/types.h>
 
-struct inet_bind_bucket;
 struct request_sock;
 struct sk_buff;
 struct sock;
 struct sockaddr;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb, bool relax,
-			    bool soreuseport_ok);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6,
 				      const struct request_sock *req, u8 proto);
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 84b2edde09b1..826f198374f8 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -62,9 +62,6 @@ struct inet_connection_sock_af_ops {
 				char __user *optval, int __user *optlen);
 #endif
 	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
-	int	    (*bind_conflict)(const struct sock *sk,
-				     const struct inet_bind_bucket *tb,
-				     bool relax, bool soreuseport_ok);
 	void	    (*mtu_reduced)(struct sock *sk);
 };
 
@@ -263,9 +260,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
 
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-			   const struct inet_bind_bucket *tb, bool relax,
-			   bool soreuseport_ok);
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d859a5c36e70..b043ec833785 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -904,7 +904,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index adfc790f7193..08bcdc3d1717 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ba597cb504ff..a1c9055769fc 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -116,9 +116,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-			   const struct inet_bind_bucket *tb, bool relax,
-			   bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+				  const struct inet_bind_bucket *tb,
+				  bool relax, bool reuseport_ok)
 {
 	struct sock *sk2;
 	bool reuse = sk->sk_reuse;
@@ -134,7 +134,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 
 	sk_for_each_bound(sk2, &tb->owners) {
 		if (sk != sk2 &&
-		    !inet_v6_ipv6only(sk2) &&
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -144,23 +143,18 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			     rcu_access_pointer(sk->sk_reuseport_cb) ||
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 			if (!relax && reuse && sk2->sk_reuse &&
 			    sk2->sk_state != TCP_LISTEN) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 		}
 	}
 	return sk2 != NULL;
 }
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
@@ -239,8 +233,7 @@ other_parity_scan:
 					smallest_size = tb->num_owners;
 					smallest_port = port;
 				}
-				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
-									      reuseport_ok))
+				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
 					goto tb_found;
 				goto next_port;
 			}
@@ -281,8 +274,7 @@ tb_found:
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 		    smallest_size == -1)
 			goto success;
-		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
-							     reuseport_ok)) {
+		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
 			if ((reuse ||
 			     (tb->fastreuseport > 0 &&
 			      sk->sk_reuseport &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 63214136cf1c..3644fc117691 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1817,7 +1817,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 55ee2ea2aee0..97074c459fe6 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,46 +28,6 @@
 #include <net/inet6_connection_sock.h>
 #include <net/sock_reuseport.h>
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb, bool relax,
-			    bool reuseport_ok)
-{
-	const struct sock *sk2;
-	bool reuse = !!sk->sk_reuse;
-	bool reuseport = !!sk->sk_reuseport && reuseport_ok;
-	kuid_t uid = sock_i_uid((struct sock *)sk);
-
-	/* We must walk the whole port owner list in this case. -DaveM */
-	/*
-	 * See comment in inet_csk_bind_conflict about sock lookup
-	 * vs net namespaces issues.
-	 */
-	sk_for_each_bound(sk2, &tb->owners) {
-		if (sk != sk2 &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if ((!reuse || !sk2->sk_reuse ||
-			     sk2->sk_state == TCP_LISTEN) &&
-			    (!reuseport || !sk2->sk_reuseport ||
-			     rcu_access_pointer(sk->sk_reuseport_cb) ||
-			     (sk2->sk_state != TCP_TIME_WAIT &&
-			      !uid_eq(uid,
-				      sock_i_uid((struct sock *)sk2))))) {
-				if (inet_rcv_saddr_equal(sk, sk2, true))
-					break;
-			}
-			if (!relax && reuse && sk2->sk_reuse &&
-			    sk2->sk_state != TCP_LISTEN &&
-			    inet_rcv_saddr_equal(sk, sk2, true))
-				break;
-		}
-	}
-
-	return sk2 != NULL;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk,
 				      struct flowi6 *fl6,
 				      const struct request_sock *req,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fc14e04028bf..f72100eedd5d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1621,7 +1621,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1652,7 +1651,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
-- 
cgit v1.2.3


From b9470c27607bed1ad3450de789c154f225530112 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:03 -0800
Subject: inet: kill smallest_size and smallest_port

In inet_csk_get_port we seem to be using smallest_port to figure out where the
best place to look for a SO_REUSEPORT sk that matches with an existing set of
SO_REUSEPORT's.  However if we get to the logic

if (smallest_size != -1) {
	port = smallest_port;
	goto have_port;
}

we will do a useless search, because we would have already done the
inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
would have gone to found_tb and succeeded.  Since this logic makes us do yet
another trip through inet_csk_bind_conflict for a port we know won't work just
delete this code and save us the time.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h   |  1 -
 net/ipv4/inet_connection_sock.c | 26 ++++----------------------
 net/ipv4/inet_hashtables.c      |  3 ---
 3 files changed, 4 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 756ed1692906..3fc0366743da 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -80,7 +80,6 @@ struct inet_bind_bucket {
 	signed char		fastreuse;
 	signed char		fastreuseport;
 	kuid_t			fastuid;
-	int			num_owners;
 	struct hlist_node	node;
 	struct hlist_head	owners;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a1c9055769fc..d3523661c905 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -165,7 +165,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 	int ret = 1, attempts = 5, port = snum;
-	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
@@ -175,7 +174,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	bool reuseport_ok = !!snum;
 
 	if (port) {
-have_port:
 		head = &hinfo->bhash[inet_bhashfn(net, port,
 						  hinfo->bhash_size)];
 		spin_lock_bh(&head->lock);
@@ -209,8 +207,6 @@ other_half_scan:
 	 * We do the opposite to not pollute connect() users.
 	 */
 	offset |= 1U;
-	smallest_size = -1;
-	smallest_port = low; /* avoid compiler warning */
 
 other_parity_scan:
 	port = low + offset;
@@ -224,15 +220,6 @@ other_parity_scan:
 		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
-				if (((tb->fastreuse > 0 && reuse) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    (tb->num_owners < smallest_size || smallest_size == -1)) {
-					smallest_size = tb->num_owners;
-					smallest_port = port;
-				}
 				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
 					goto tb_found;
 				goto next_port;
@@ -243,10 +230,6 @@ next_port:
 		cond_resched();
 	}
 
-	if (smallest_size != -1) {
-		port = smallest_port;
-		goto have_port;
-	}
 	offset--;
 	if (!(offset & 1))
 		goto other_parity_scan;
@@ -268,19 +251,18 @@ tb_found:
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 && reuse) ||
+		if ((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-		    smallest_size == -1)
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
 		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
 			if ((reuse ||
 			     (tb->fastreuseport > 0 &&
 			      sk->sk_reuseport &&
 			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			      uid_eq(tb->fastuid, uid))) &&
-			    !snum && smallest_size != -1 && --attempts >= 0) {
+			      uid_eq(tb->fastuid, uid))) && !snum &&
+			    --attempts >= 0) {
 				spin_unlock_bh(&head->lock);
 				goto again;
 			}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2ef9b010bd34..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->port      = snum;
 		tb->fastreuse = 0;
 		tb->fastreuseport = 0;
-		tb->num_owners = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
 	}
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 {
 	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
-	tb->num_owners++;
 	inet_csk(sk)->icsk_bind_hash = tb;
 }
 
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
-	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
-- 
cgit v1.2.3


From 6cd66616834c89b8a6c8a182c4c99e5478cf6d6b Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:04 -0800
Subject: inet: don't check for bind conflicts twice when searching for a port

This is just wasted time, we've already found a tb that doesn't have a bind
conflict, and we don't drop the head lock so scanning again isn't going to give
us a different answer.  Instead move the tb->reuse setting logic outside of the
found_tb path and put it in the success: path.  Then make it so that we don't
goto again if we find a bind conflict in the found_tb path as we won't reach
this anymore when we are scanning for an ephemeral port.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d3523661c905..f7e844d84836 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -164,7 +164,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
 	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-	int ret = 1, attempts = 5, port = snum;
+	int ret = 1, port = snum;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
@@ -183,7 +183,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 
 		goto tb_not_found;
 	}
-again:
 	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
 	inet_get_local_port_range(net, &low, &high);
@@ -221,7 +220,7 @@ other_parity_scan:
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
 				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
-					goto tb_found;
+					goto success;
 				goto next_port;
 			}
 		goto tb_not_found;
@@ -256,23 +255,11 @@ tb_found:
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
-		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
-			if ((reuse ||
-			     (tb->fastreuseport > 0 &&
-			      sk->sk_reuseport &&
-			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			      uid_eq(tb->fastuid, uid))) && !snum &&
-			    --attempts >= 0) {
-				spin_unlock_bh(&head->lock);
-				goto again;
-			}
+		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
 			goto fail_unlock;
-		}
-		if (!reuse)
-			tb->fastreuse = 0;
-		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
-			tb->fastreuseport = 0;
-	} else {
+	}
+success:
+	if (!hlist_empty(&tb->owners)) {
 		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
@@ -280,8 +267,12 @@ tb_found:
 		} else {
 			tb->fastreuseport = 0;
 		}
+	} else {
+		if (!reuse)
+			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
 	}
-success:
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
-- 
cgit v1.2.3


From 289141b7688b71dc69b8d7a54bf67a4d7bc79f96 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:05 -0800
Subject: inet: split inet_csk_get_port into two functions

inet_csk_get_port does two different things, it either scans for an open port,
or it tries to see if the specified port is available for use.  Since these two
operations have different rules and are basically independent lets split them
into two different functions to make them both more readable.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 66 +++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f7e844d84836..bbe28920e2d8 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -156,33 +156,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 	return sk2 != NULL;
 }
 
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket.  Returns with the
+ * inet_bind_hashbucket lock held.
  */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
 {
-	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-	int ret = 1, port = snum;
+	int port = 0;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
 	struct inet_bind_bucket *tb;
-	kuid_t uid = sock_i_uid(sk);
 	u32 remaining, offset;
-	bool reuseport_ok = !!snum;
 
-	if (port) {
-		head = &hinfo->bhash[inet_bhashfn(net, port,
-						  hinfo->bhash_size)];
-		spin_lock_bh(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == port)
-				goto tb_found;
-
-		goto tb_not_found;
-	}
 	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
 	inet_get_local_port_range(net, &low, &high);
@@ -219,11 +207,12 @@ other_parity_scan:
 		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
-				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
+				if (!inet_csk_bind_conflict(sk, tb, false, false))
 					goto success;
 				goto next_port;
 			}
-		goto tb_not_found;
+		tb = NULL;
+		goto success;
 next_port:
 		spin_unlock_bh(&head->lock);
 		cond_resched();
@@ -238,8 +227,41 @@ next_port:
 		attempt_half = 2;
 		goto other_half_scan;
 	}
-	return ret;
+	return NULL;
+success:
+	*port_ret = port;
+	*tb_ret = tb;
+	return head;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, port = snum;
+	struct inet_bind_hashbucket *head;
+	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb = NULL;
+	kuid_t uid = sock_i_uid(sk);
 
+	if (!port) {
+		head = inet_csk_find_open_port(sk, &tb, &port);
+		if (!head)
+			return ret;
+		if (!tb)
+			goto tb_not_found;
+		goto success;
+	}
+	head = &hinfo->bhash[inet_bhashfn(net, port,
+					  hinfo->bhash_size)];
+	spin_lock_bh(&head->lock);
+	inet_bind_bucket_for_each(tb, &head->chain)
+		if (net_eq(ib_net(tb), net) && tb->port == port)
+			goto tb_found;
 tb_not_found:
 	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 				     net, head, port);
@@ -255,7 +277,7 @@ tb_found:
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
-		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
+		if (inet_csk_bind_conflict(sk, tb, true, true))
 			goto fail_unlock;
 	}
 success:
-- 
cgit v1.2.3


From 637bc8bbe6c0a288a596edfdcdd5657c72a848db Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 17 Jan 2017 07:51:06 -0800
Subject: inet: reset tb->fastreuseport when adding a reuseport sk

If we have non reuseport sockets on a tb we will set tb->fastreuseport to 0 and
never set it again.  Which means that in the future if we end up adding a bunch
of reuseport sk's to that tb we'll have to do the expensive scan every time.
Instead add the ipv4/ipv6 saddr fields to the bind bucket, as well as the family
so we know what comparison to make, and the ipv6 only setting so we can make
sure to compare with new sockets appropriately.  Once one sk has made it onto
the list we know that there are no potential bind conflicts on the owners list
that match that sk's rcv_addr.  So copy the sk's information into our bind
bucket and set tb->fastruseport to FASTREUSESOCK_STRICT so we know we have to do
an extra check for subsequent reuseport sockets and skip the expensive bind
conflict check.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h   |   9 ++++
 net/ipv4/inet_connection_sock.c | 106 ++++++++++++++++++++++++++++++++--------
 2 files changed, 95 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 3fc0366743da..1178931288cb 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -74,12 +74,21 @@ struct inet_ehash_bucket {
  * users logged onto your box, isn't it nice to know that new data
  * ports are created in O(1) time?  I thought so. ;-)	-DaveM
  */
+#define FASTREUSEPORT_ANY	1
+#define FASTREUSEPORT_STRICT	2
+
 struct inet_bind_bucket {
 	possible_net_t		ib_net;
 	unsigned short		port;
 	signed char		fastreuse;
 	signed char		fastreuseport;
 	kuid_t			fastuid;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr		fast_v6_rcv_saddr;
+#endif
+	__be32			fast_rcv_saddr;
+	unsigned short		fast_sk_family;
+	bool			fast_ipv6_only;
 	struct hlist_node	node;
 	struct hlist_head	owners;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bbe28920e2d8..096a085611ab 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -38,20 +38,21 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
  *                          and 0.0.0.0 equals to 0.0.0.0 only
  */
-static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+				const struct in6_addr *sk2_rcv_saddr6,
+				__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				bool sk1_ipv6only, bool sk2_ipv6only,
 				bool match_wildcard)
 {
-	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
 
 	/* if both are mapped, treat as IPv4 */
 	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
 		if (!sk2_ipv6only) {
-			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+			if (sk1_rcv_saddr == sk2_rcv_saddr)
 				return 1;
-			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+			if (!sk1_rcv_saddr || !sk2_rcv_saddr)
 				return match_wildcard;
 		}
 		return 0;
@@ -65,11 +66,11 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 		return 1;
 
 	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
 		return 1;
 
 	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+	    ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
 		return 1;
 
 	return 0;
@@ -80,13 +81,13 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
  * match_wildcard == false: addresses must be exactly the same, i.e.
  *                          0.0.0.0 only equals to 0.0.0.0
  */
-static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-				bool match_wildcard)
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				bool sk2_ipv6only, bool match_wildcard)
 {
-	if (!ipv6_only_sock(sk2)) {
-		if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+	if (!sk2_ipv6only) {
+		if (sk1_rcv_saddr == sk2_rcv_saddr)
 			return 1;
-		if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+		if (!sk1_rcv_saddr || !sk2_rcv_saddr)
 			return match_wildcard;
 	}
 	return 0;
@@ -97,9 +98,16 @@ int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 {
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
-		return ipv6_rcv_saddr_equal(sk, sk2, match_wildcard);
+		return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+					    &sk2->sk_v6_rcv_saddr,
+					    sk->sk_rcv_saddr,
+					    sk2->sk_rcv_saddr,
+					    ipv6_only_sock(sk),
+					    ipv6_only_sock(sk2),
+					    match_wildcard);
 #endif
-	return ipv4_rcv_saddr_equal(sk, sk2, match_wildcard);
+	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+				    ipv6_only_sock(sk2), match_wildcard);
 }
 EXPORT_SYMBOL(inet_rcv_saddr_equal);
 
@@ -234,6 +242,39 @@ success:
 	return head;
 }
 
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+				     struct sock *sk)
+{
+	kuid_t uid = sock_i_uid(sk);
+
+	if (tb->fastreuseport <= 0)
+		return 0;
+	if (!sk->sk_reuseport)
+		return 0;
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		return 0;
+	if (!uid_eq(tb->fastuid, uid))
+		return 0;
+	/* We only need to check the rcv_saddr if this tb was once marked
+	 * without fastreuseport and then was reset, as we can only know that
+	 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+	 * owners list.
+	 */
+	if (tb->fastreuseport == FASTREUSEPORT_ANY)
+		return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (tb->fast_sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+					    &sk->sk_v6_rcv_saddr,
+					    tb->fast_rcv_saddr,
+					    sk->sk_rcv_saddr,
+					    tb->fast_ipv6_only,
+					    ipv6_only_sock(sk), true);
+#endif
+	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+				    ipv6_only_sock(sk), true);
+}
+
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  * We try to allocate an odd port (and leave even ports for connect())
@@ -273,9 +314,7 @@ tb_found:
 			goto success;
 
 		if ((tb->fastreuse > 0 && reuse) ||
-		     (tb->fastreuseport > 0 &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
+		    sk_reuseport_match(tb, sk))
 			goto success;
 		if (inet_csk_bind_conflict(sk, tb, true, true))
 			goto fail_unlock;
@@ -284,16 +323,43 @@ success:
 	if (!hlist_empty(&tb->owners)) {
 		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
-			tb->fastreuseport = 1;
+			tb->fastreuseport = FASTREUSEPORT_ANY;
 			tb->fastuid = uid;
+			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+			tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+			tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
 		} else {
 			tb->fastreuseport = 0;
 		}
 	} else {
 		if (!reuse)
 			tb->fastreuse = 0;
-		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+		if (sk->sk_reuseport) {
+			/* We didn't match or we don't have fastreuseport set on
+			 * the tb, but we have sk_reuseport set on this socket
+			 * and we know that there are no bind conflicts with
+			 * this socket in this tb, so reset our tb's reuseport
+			 * settings so that any subsequent sockets that match
+			 * our current socket will be put on the fast path.
+			 *
+			 * If we reset we need to set FASTREUSEPORT_STRICT so we
+			 * do extra checking for all subsequent sk_reuseport
+			 * socks.
+			 */
+			if (!sk_reuseport_match(tb, sk)) {
+				tb->fastreuseport = FASTREUSEPORT_STRICT;
+				tb->fastuid = uid;
+				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+				tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+				tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+			}
+		} else {
 			tb->fastreuseport = 0;
+		}
 	}
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
-- 
cgit v1.2.3


From 9a6d87626252496311cd122aaa9fbf21ae19e449 Mon Sep 17 00:00:00 2001
From: Liping Zhang
Date: Sat, 7 Jan 2017 21:33:54 +0800
Subject: netfilter: pkttype: unnecessary to check ipv6 multicast address

Since there's no broadcast address in IPV6, so in ipv6 family, the
PACKET_LOOPBACK must be multicast packets, there's no need to check
it again.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_meta.c   | 5 +----
 net/netfilter/xt_pkttype.c | 3 +--
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 66c7f4b4c49b..9a22b24346b8 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -154,10 +154,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 				*dest = PACKET_BROADCAST;
 			break;
 		case NFPROTO_IPV6:
-			if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
-				*dest = PACKET_MULTICAST;
-			else
-				*dest = PACKET_BROADCAST;
+			*dest = PACKET_MULTICAST;
 			break;
 		default:
 			WARN_ON(1);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 57efb703ff18..1ef99151b3ba 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -33,8 +33,7 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	else if (xt_family(par) == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (xt_family(par) == NFPROTO_IPV6 &&
-	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+	else if (xt_family(par) == NFPROTO_IPV6)
 		type = PACKET_MULTICAST;
 	else
 		type = PACKET_BROADCAST;
-- 
cgit v1.2.3


From f169fd695b192dd7b23aff8e69d25a1bc881bbfa Mon Sep 17 00:00:00 2001
From: Liping Zhang
Date: Sat, 7 Jan 2017 21:33:55 +0800
Subject: netfilter: nft_meta: deal with PACKET_LOOPBACK in netdev family

After adding the following nft rule, then ping 224.0.0.1:
  # nft add rule netdev t c pkttype host counter

The warning complain message will be printed out again and again:
  WARNING: CPU: 0 PID: 10182 at net/netfilter/nft_meta.c:163 \
           nft_meta_get_eval+0x3fe/0x460 [nft_meta]
  [...]
  Call Trace:
  <IRQ>
  dump_stack+0x85/0xc2
  __warn+0xcb/0xf0
  warn_slowpath_null+0x1d/0x20
  nft_meta_get_eval+0x3fe/0x460 [nft_meta]
  nft_do_chain+0xff/0x5e0 [nf_tables]

So we should deal with PACKET_LOOPBACK in netdev family too. For ipv4,
convert it to PACKET_BROADCAST/MULTICAST according to the destination
address's type; For ipv6, convert it to PACKET_MULTICAST directly.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_meta.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 9a22b24346b8..e1f5ca9b423b 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -156,8 +156,34 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		case NFPROTO_IPV6:
 			*dest = PACKET_MULTICAST;
 			break;
+		case NFPROTO_NETDEV:
+			switch (skb->protocol) {
+			case htons(ETH_P_IP): {
+				int noff = skb_network_offset(skb);
+				struct iphdr *iph, _iph;
+
+				iph = skb_header_pointer(skb, noff,
+							 sizeof(_iph), &_iph);
+				if (!iph)
+					goto err;
+
+				if (ipv4_is_multicast(iph->daddr))
+					*dest = PACKET_MULTICAST;
+				else
+					*dest = PACKET_BROADCAST;
+
+				break;
+			}
+			case htons(ETH_P_IPV6):
+				*dest = PACKET_MULTICAST;
+				break;
+			default:
+				WARN_ON_ONCE(1);
+				goto err;
+			}
+			break;
 		default:
-			WARN_ON(1);
+			WARN_ON_ONCE(1);
 			goto err;
 		}
 		break;
-- 
cgit v1.2.3


From cc16f00f6529aa2378f2b949a6f68e9dc6dec363 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:42 +0800
Subject: sctp: add support for generating stream reconf ssn reset request
 chunk

This patch is to add asoc strreset_outseq and strreset_inseq for
saving the reconf request sequence, initialize them when create
assoc and process init, and also to define Incoming and Outgoing
SSN Reset Request Parameter described in rfc6525 section 4.1 and
4.2, As they can be in one same chunk as section rfc6525 3.1-3
describes, it makes them in one function.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       |  27 ++++++++++
 include/net/sctp/sm.h      |   5 +-
 include/net/sctp/structs.h |   3 ++
 net/sctp/associola.c       |   1 +
 net/sctp/sm_make_chunk.c   | 121 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 156 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index fcb4c3646173..a9e790685af3 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -108,6 +108,7 @@ typedef enum {
 	/* Use hex, as defined in ADDIP sec. 3.1 */
 	SCTP_CID_ASCONF			= 0xC1,
 	SCTP_CID_ASCONF_ACK		= 0x80,
+	SCTP_CID_RECONF			= 0x82,
 } sctp_cid_t; /* enum */
 
 
@@ -199,6 +200,13 @@ typedef enum {
 	SCTP_PARAM_SUCCESS_REPORT	= cpu_to_be16(0xc005),
 	SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006),
 
+	/* RE-CONFIG. Section 4 */
+	SCTP_PARAM_RESET_OUT_REQUEST		= cpu_to_be16(0x000d),
+	SCTP_PARAM_RESET_IN_REQUEST		= cpu_to_be16(0x000e),
+	SCTP_PARAM_RESET_TSN_REQUEST		= cpu_to_be16(0x000f),
+	SCTP_PARAM_RESET_RESPONSE		= cpu_to_be16(0x0010),
+	SCTP_PARAM_RESET_ADD_OUT_STREAMS	= cpu_to_be16(0x0011),
+	SCTP_PARAM_RESET_ADD_IN_STREAMS		= cpu_to_be16(0x0012),
 } sctp_param_t; /* enum */
 
 
@@ -710,4 +718,23 @@ struct sctp_infox {
 	struct sctp_association *asoc;
 };
 
+struct sctp_reconf_chunk {
+	sctp_chunkhdr_t chunk_hdr;
+	__u8 params[0];
+} __packed;
+
+struct sctp_strreset_outreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u32 response_seq;
+	__u32 send_reset_at_tsn;
+	__u16 list_of_streams[0];
+} __packed;
+
+struct sctp_strreset_inreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u16 list_of_streams[0];
+} __packed;
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ca6c971dd74a..3462cb08f51a 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -259,7 +259,10 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 				    __u32 new_cum_tsn, size_t nstreams,
 				    struct sctp_fwdtsn_skip *skiplist);
 struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc);
-
+struct sctp_chunk *sctp_make_strreset_req(
+				const struct sctp_association *asoc,
+				__u16 stream_num, __u16 *stream_list,
+				bool out, bool in);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 4741ec240caf..3dc983e97564 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1865,6 +1865,9 @@ struct sctp_association {
 	     temp:1,		/* Is it a temporary association? */
 	     prsctp_enable:1;
 
+	__u32 strreset_outseq; /* Update after receiving response */
+	__u32 strreset_inseq; /* Update after receiving request */
+
 	struct sctp_priv_assoc_stats stats;
 
 	int sent_cnt_removable;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 36294f7fb9a7..42ece6f35b98 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -207,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * association to the same value as the initial TSN.
 	 */
 	asoc->addip_serial = asoc->c.initial_tsn;
+	asoc->strreset_outseq = asoc->c.initial_tsn;
 
 	INIT_LIST_HEAD(&asoc->addip_chunk_list);
 	INIT_LIST_HEAD(&asoc->asconf_ack_list);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 80a9088084ac..df81fce08890 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1844,6 +1844,7 @@ no_hmac:
 	retval->next_tsn = retval->c.initial_tsn;
 	retval->ctsn_ack_point = retval->next_tsn - 1;
 	retval->addip_serial = retval->c.initial_tsn;
+	retval->strreset_outseq = retval->c.initial_tsn;
 	retval->adv_peer_ack_point = retval->ctsn_ack_point;
 	retval->peer.prsctp_capable = retval->c.prsctp_capable;
 	retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2387,6 +2388,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	asoc->peer.i.initial_tsn =
 		ntohl(peer_init->init_hdr.initial_tsn);
 
+	asoc->strreset_inseq = asoc->peer.i.initial_tsn;
+
 	/* Apply the upper bounds for output streams based on peer's
 	 * number of inbound streams.
 	 */
@@ -3524,3 +3527,121 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	return retval;
 }
+
+/* RE-CONFIG 3.1 (RE-CONFIG chunk)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  | Type = 130    |  Chunk Flags  |      Chunk Length             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /                  Re-configuration Parameter                   /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /             Re-configuration Parameter (optional)             /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static struct sctp_chunk *sctp_make_reconf(
+				const struct sctp_association *asoc,
+				int length)
+{
+	struct sctp_reconf_chunk *reconf;
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
+				   GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
+	retval->param_hdr.v = reconf->params;
+
+	return retval;
+}
+
+/* RE-CONFIG 4.1 (STREAM OUT RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 13       | Parameter Length = 16 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Request Sequence Number            |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Response Sequence Number           |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                Sender's Last Assigned TSN                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * RE-CONFIG 4.2 (STREAM IN RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 14       |  Parameter Length = 8 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_req(
+				const struct sctp_association *asoc,
+				__u16 stream_num, __u16 *stream_list,
+				bool out, bool in)
+{
+	struct sctp_strreset_outreq outreq;
+	__u16 stream_len = stream_num * 2;
+	struct sctp_strreset_inreq inreq;
+	struct sctp_chunk *retval;
+	__u16 outlen, inlen, i;
+
+	outlen = (sizeof(outreq) + stream_len) * out;
+	inlen = (sizeof(inreq) + stream_len) * in;
+
+	retval = sctp_make_reconf(asoc, outlen + inlen);
+	if (!retval)
+		return NULL;
+
+	for (i = 0; i < stream_num; i++)
+		stream_list[i] = htons(stream_list[i]);
+
+	if (outlen) {
+		outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
+		outreq.param_hdr.length = htons(outlen);
+		outreq.request_seq = htonl(asoc->strreset_outseq);
+		outreq.response_seq = htonl(asoc->strreset_inseq - 1);
+		outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
+
+		sctp_addto_chunk(retval, sizeof(outreq), &outreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	if (inlen) {
+		inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
+		inreq.param_hdr.length = htons(inlen);
+		inreq.request_seq = htonl(asoc->strreset_outseq + out);
+
+		sctp_addto_chunk(retval, sizeof(inreq), &inreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	for (i = 0; i < stream_num; i++)
+		stream_list[i] = ntohs(stream_list[i]);
+
+	return retval;
+}
-- 
cgit v1.2.3


From 7b9438de0cd4b46a6914416bfede6cf839cd9e68 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:43 +0800
Subject: sctp: add stream reconf timer

This patch is to add a per transport timer based on sctp timer frame
for stream reconf chunk retransmission. It would start after sending
a reconf request chunk, and stop after receiving the response chunk.

If the timer expires, besides retransmitting the reconf request chunk,
it would also do the same thing with data RTO timer. like to increase
the appropriate error counts, and perform threshold management, possibly
destroying the asoc if sctp retransmission thresholds are exceeded, just
as section 5.1.1 describes.

This patch is also to add asoc strreset_chunk, it is used to save the
reconf request chunk, so that it can be retransmitted, and to check if
the response is really for this request by comparing the information
inside with the response chunk as well.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h |  1 +
 include/net/sctp/sm.h        |  2 ++
 include/net/sctp/structs.h   |  6 ++++++
 net/sctp/associola.c         |  9 +++++++++
 net/sctp/sm_sideeffect.c     | 32 ++++++++++++++++++++++++++++++++
 net/sctp/sm_statefuns.c      | 28 ++++++++++++++++++++++++++++
 net/sctp/sm_statetable.c     | 20 ++++++++++++++++++++
 net/sctp/transport.c         | 17 +++++++++++++++--
 8 files changed, 113 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 5b847e49f7e9..8307c862b5c2 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -90,6 +90,7 @@ typedef enum {
 	SCTP_EVENT_TIMEOUT_T4_RTO,
 	SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
 	SCTP_EVENT_TIMEOUT_HEARTBEAT,
+	SCTP_EVENT_TIMEOUT_RECONF,
 	SCTP_EVENT_TIMEOUT_SACK,
 	SCTP_EVENT_TIMEOUT_AUTOCLOSE,
 } sctp_event_timeout_t;
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 3462cb08f51a..d2d9e28fe783 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -167,6 +167,7 @@ sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort;
 
 /* Prototypes for timeout event state functions.  */
 sctp_state_fn_t sctp_sf_do_6_3_3_rtx;
+sctp_state_fn_t sctp_sf_send_reconf;
 sctp_state_fn_t sctp_sf_do_6_2_sack;
 sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 
@@ -278,6 +279,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype,
 /* 2nd level prototypes */
 void sctp_generate_t3_rtx_event(unsigned long peer);
 void sctp_generate_heartbeat_event(unsigned long peer);
+void sctp_generate_reconf_event(unsigned long peer);
 void sctp_generate_proto_unreach_event(unsigned long peer);
 
 void sctp_ootb_pkt_free(struct sctp_packet *);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 3dc983e97564..463b4d642d68 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -877,6 +877,9 @@ struct sctp_transport {
 	/* Timer to handle ICMP proto unreachable envets */
 	struct timer_list proto_unreach_timer;
 
+	/* Timer to handler reconf chunk rtx */
+	struct timer_list reconf_timer;
+
 	/* Since we're using per-destination retransmission timers
 	 * (see above), we're also using per-destination "transmitted"
 	 * queues.  This probably ought to be a private struct
@@ -935,6 +938,7 @@ void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk);
 void sctp_transport_free(struct sctp_transport *);
 void sctp_transport_reset_t3_rtx(struct sctp_transport *);
 void sctp_transport_reset_hb_timer(struct sctp_transport *);
+void sctp_transport_reset_reconf_timer(struct sctp_transport *transport);
 int sctp_transport_hold(struct sctp_transport *);
 void sctp_transport_put(struct sctp_transport *);
 void sctp_transport_update_rto(struct sctp_transport *, __u32);
@@ -1868,6 +1872,8 @@ struct sctp_association {
 	__u32 strreset_outseq; /* Update after receiving response */
 	__u32 strreset_inseq; /* Update after receiving request */
 
+	struct sctp_chunk *strreset_chunk; /* save request chunk */
+
 	struct sctp_priv_assoc_stats stats;
 
 	int sent_cnt_removable;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 42ece6f35b98..fc33540d2f11 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -362,6 +362,9 @@ void sctp_association_free(struct sctp_association *asoc)
 	/* Free stream information. */
 	sctp_stream_free(asoc->stream);
 
+	if (asoc->strreset_chunk)
+		sctp_chunk_free(asoc->strreset_chunk);
+
 	/* Clean up the bound address list. */
 	sctp_bind_addr_free(&asoc->base.bind_addr);
 
@@ -520,6 +523,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
 	if (asoc->peer.last_data_from == peer)
 		asoc->peer.last_data_from = transport;
 
+	if (asoc->strreset_chunk &&
+	    asoc->strreset_chunk->transport == peer) {
+		asoc->strreset_chunk->transport = transport;
+		sctp_transport_reset_reconf_timer(transport);
+	}
+
 	/* If we remove the transport an INIT was last sent to, set it to
 	 * NULL. Combined with the update of the retran path above, this
 	 * will cause the next INIT to be sent to the next available
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf153bed..a4552712b882 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -436,6 +436,37 @@ out_unlock:
 	sctp_association_put(asoc);
 }
 
+ /* Handle the timeout of the RE-CONFIG timer. */
+void sctp_generate_reconf_event(unsigned long data)
+{
+	struct sctp_transport *transport = (struct sctp_transport *)data;
+	struct sctp_association *asoc = transport->asoc;
+	struct sock *sk = asoc->base.sk;
+	struct net *net = sock_net(sk);
+	int error = 0;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		pr_debug("%s: sock is busy\n", __func__);
+
+		/* Try again later.  */
+		if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
+			sctp_transport_hold(transport);
+		goto out_unlock;
+	}
+
+	error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+			   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
+			   asoc->state, asoc->ep, asoc,
+			   transport, GFP_ATOMIC);
+
+	if (error)
+		sk->sk_err = -error;
+
+out_unlock:
+	bh_unlock_sock(sk);
+	sctp_transport_put(transport);
+}
 
 /* Inject a SACK Timeout event into the state machine.  */
 static void sctp_generate_sack_event(unsigned long data)
@@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
 	sctp_generate_t4_rto_event,
 	sctp_generate_t5_shutdown_guard_event,
 	NULL,
+	NULL,
 	sctp_generate_sack_event,
 	sctp_generate_autoclose_event,
 };
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 0ceded37d20b..2ae186aba9a8 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1021,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
 	return SCTP_DISPOSITION_CONSUME;
 }
 
+/* resend asoc strreset_chunk.  */
+sctp_disposition_t sctp_sf_send_reconf(struct net *net,
+				       const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type, void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *transport = arg;
+
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		/* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	sctp_chunk_hold(asoc->strreset_chunk);
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+			SCTP_CHUNK(asoc->strreset_chunk));
+	sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * Process an heartbeat request.
  *
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index a987d54b379c..3da521abfc57 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -888,6 +888,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
 	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
 }
 
+#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
 static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
 	TYPE_SCTP_EVENT_TIMEOUT_NONE,
 	TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
@@ -897,6 +916,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S
 	TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
 	TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
 	TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
+	TYPE_SCTP_EVENT_TIMEOUT_RECONF,
 	TYPE_SCTP_EVENT_TIMEOUT_SACK,
 	TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
 };
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a1652ab63918..baa1ac00d7b5 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
 	INIT_LIST_HEAD(&peer->transports);
 
 	setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
-			(unsigned long)peer);
+		    (unsigned long)peer);
 	setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
-			(unsigned long)peer);
+		    (unsigned long)peer);
+	setup_timer(&peer->reconf_timer, sctp_generate_reconf_event,
+		    (unsigned long)peer);
 	setup_timer(&peer->proto_unreach_timer,
 		    sctp_generate_proto_unreach_event, (unsigned long)peer);
 
@@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport)
 	if (del_timer(&transport->T3_rtx_timer))
 		sctp_transport_put(transport);
 
+	if (del_timer(&transport->reconf_timer))
+		sctp_transport_put(transport);
+
 	/* Delete the ICMP proto unreachable timer if it's active. */
 	if (del_timer(&transport->proto_unreach_timer))
 		sctp_association_put(transport->asoc);
@@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
 		sctp_transport_hold(transport);
 }
 
+void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
+{
+	if (!timer_pending(&transport->reconf_timer))
+		if (!mod_timer(&transport->reconf_timer,
+			       jiffies + transport->rto))
+			sctp_transport_hold(transport);
+}
+
 /* This transport has been assigned to an association.
  * Initialize fields from the association or from the sock itself.
  * Register the reference count in the association.
-- 
cgit v1.2.3


From 7a090b04522b46a219c271d4cd2abbf572623e03 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:44 +0800
Subject: sctp: add stream reconf primitive

This patch is to add a primitive based on sctp primitive frame for
sending stream reconf request. It works as the other primitives,
and create a SCTP_CMD_REPLY command to send the request chunk out.

sctp_primitive_RECONF would be the api to send a reconf request
chunk.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h |  3 ++-
 include/net/sctp/sctp.h      |  2 ++
 include/net/sctp/sm.h        |  1 +
 net/sctp/primitive.c         |  3 +++
 net/sctp/sm_statefuns.c      | 13 +++++++++++++
 net/sctp/sm_statetable.c     | 20 ++++++++++++++++++++
 6 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8307c862b5c2..3567c971cf3b 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -114,9 +114,10 @@ typedef enum {
 	SCTP_PRIMITIVE_SEND,
 	SCTP_PRIMITIVE_REQUESTHEARTBEAT,
 	SCTP_PRIMITIVE_ASCONF,
+	SCTP_PRIMITIVE_RECONF,
 } sctp_event_primitive_t;
 
-#define SCTP_EVENT_PRIMITIVE_MAX	SCTP_PRIMITIVE_ASCONF
+#define SCTP_EVENT_PRIMITIVE_MAX	SCTP_PRIMITIVE_RECONF
 #define SCTP_NUM_PRIMITIVE_TYPES	(SCTP_EVENT_PRIMITIVE_MAX + 1)
 
 /* We define here a utility type for manipulating subtypes.
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 598d938b0d0a..bc0e049b1474 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -141,6 +141,8 @@ int sctp_primitive_ABORT(struct net *, struct sctp_association *, void *arg);
 int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg);
 int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, void *arg);
 int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg);
+int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
+			  void *arg);
 
 /*
  * sctp/input.c
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index d2d9e28fe783..430ed139fbbb 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -157,6 +157,7 @@ sctp_state_fn_t sctp_sf_error_shutdown;
 sctp_state_fn_t sctp_sf_ignore_primitive;
 sctp_state_fn_t sctp_sf_do_prm_requestheartbeat;
 sctp_state_fn_t sctp_sf_do_prm_asconf;
+sctp_state_fn_t sctp_sf_do_prm_reconf;
 
 /* Prototypes for other event state functions.  */
 sctp_state_fn_t sctp_sf_do_no_pending_tsn;
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f96a177..f0553a022859 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
 */
 
 DECLARE_PRIMITIVE(ASCONF);
+
+/* RE-CONFIG 5.1 */
+DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 2ae186aba9a8..782e579472c9 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5185,6 +5185,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
 	return SCTP_DISPOSITION_CONSUME;
 }
 
+/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
+sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
+					 const struct sctp_endpoint *ep,
+					 const struct sctp_association *asoc,
+					 const sctp_subtype_t type,
+					 void *arg, sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * Ignore the primitive event
  *
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 3da521abfc57..b5438b4f6c1e 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -643,6 +643,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
 	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
 } /* TYPE_SCTP_PRIMITIVE_ASCONF */
 
+#define TYPE_SCTP_PRIMITIVE_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_RECONF */
+
 /* The primary index for this table is the primitive type.
  * The secondary index for this table is the state.
  */
@@ -653,6 +672,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
 	TYPE_SCTP_PRIMITIVE_SEND,
 	TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
 	TYPE_SCTP_PRIMITIVE_ASCONF,
+	TYPE_SCTP_PRIMITIVE_RECONF,
 };
 
 #define TYPE_SCTP_OTHER_NO_PENDING_TSN  { \
-- 
cgit v1.2.3


From c28445c3cb07ba1da2c1dc7b5f3066c686a6acc6 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:45 +0800
Subject: sctp: add reconf_enable in asoc ep and netns

This patch is to add reconf_enable field in all of asoc ep and netns
to indicate if they support stream reset.

When initializing, asoc reconf_enable get the default value from ep
reconf_enable which is from netns netns reconf_enable by default.

It is also to add reconf_capable in asoc peer part to know if peer
supports reconf_enable, the value is set if ext params have reconf
chunk support when processing init chunk, just as rfc6525 section
5.1.1 demands.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/sctp.h   |  3 +++
 include/net/sctp/structs.h |  7 +++++--
 net/sctp/associola.c       |  1 +
 net/sctp/endpointola.c     |  1 +
 net/sctp/protocol.c        |  3 +++
 net/sctp/sm_make_chunk.c   | 15 +++++++++++++++
 6 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index c501d67172b1..b7871d018354 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -118,6 +118,9 @@ struct netns_sctp {
 	/* Flag to indicate if PR-SCTP is enabled. */
 	int prsctp_enable;
 
+	/* Flag to indicate if PR-CONFIG is enabled. */
+	int reconf_enable;
+
 	/* Flag to idicate if SCTP-AUTH is enabled */
 	int auth_enable;
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 463b4d642d68..ee037ef15d65 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1255,7 +1255,8 @@ struct sctp_endpoint {
 	struct list_head endpoint_shared_keys;
 	__u16 active_key_id;
 	__u8  auth_enable:1,
-	      prsctp_enable:1;
+	      prsctp_enable:1,
+	      reconf_enable:1;
 };
 
 /* Recover the outter endpoint structure. */
@@ -1508,6 +1509,7 @@ struct sctp_association {
 			hostname_address:1, /* Peer understands DNS addresses? */
 			asconf_capable:1,   /* Does peer support ADDIP? */
 			prsctp_capable:1,   /* Can peer do PR-SCTP? */
+			reconf_capable:1,   /* Can peer do RE-CONFIG? */
 			auth_capable:1;     /* Is peer doing SCTP-AUTH? */
 
 		/* sack_needed : This flag indicates if the next received
@@ -1867,7 +1869,8 @@ struct sctp_association {
 
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
-	     prsctp_enable:1;
+	     prsctp_enable:1,
+	     reconf_enable:1;
 
 	__u32 strreset_outseq; /* Update after receiving response */
 	__u32 strreset_inseq; /* Update after receiving request */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index fc33540d2f11..68b99adc21a3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -270,6 +270,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
 	asoc->active_key_id = ep->active_key_id;
 	asoc->prsctp_enable = ep->prsctp_enable;
+	asoc->reconf_enable = ep->reconf_enable;
 
 	/* Save the hmacs and chunks list into this association */
 	if (ep->auth_hmacs_list)
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 410ddc1e3443..8c589230794f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	ep->auth_hmacs_list = auth_hmacs;
 	ep->auth_chunk_list = auth_chunks;
 	ep->prsctp_enable = net->sctp.prsctp_enable;
+	ep->reconf_enable = net->sctp.reconf_enable;
 
 	return ep;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f9c3c37c9ae0..8227bbbd077a 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1258,6 +1258,9 @@ static int __net_init sctp_defaults_init(struct net *net)
 	/* Enable PR-SCTP by default. */
 	net->sctp.prsctp_enable = 1;
 
+	/* Disable RECONF by default. */
+	net->sctp.reconf_enable = 0;
+
 	/* Disable AUTH by default. */
 	net->sctp.auth_enable = 0;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index df81fce08890..ad3445b3408e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 		num_ext += 2;
 	}
 
+	if (asoc->reconf_enable) {
+		extensions[num_ext] = SCTP_CID_RECONF;
+		num_ext += 1;
+	}
+
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 		num_ext += 2;
 	}
 
+	if (asoc->peer.reconf_capable) {
+		extensions[num_ext] = SCTP_CID_RECONF;
+		num_ext += 1;
+	}
+
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
@@ -2012,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 
 	for (i = 0; i < num_ext; i++) {
 		switch (param.ext->chunks[i]) {
+		case SCTP_CID_RECONF:
+			if (asoc->reconf_enable &&
+			    !asoc->peer.reconf_capable)
+				asoc->peer.reconf_capable = 1;
+			break;
 		case SCTP_CID_FWD_TSN:
 			if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
 				asoc->peer.prsctp_capable = 1;
-- 
cgit v1.2.3


From 9fb657aec0e20b4ed4401c44a4140f8d7b7a9ca0 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:46 +0800
Subject: sctp: add sockopt SCTP_ENABLE_STREAM_RESET

This patch is to add sockopt SCTP_ENABLE_STREAM_RESET to get/set
strreset_enable to indicate which reconf request type it supports,
which is described in rfc6525 section 6.3.1.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  4 +++
 include/uapi/linux/sctp.h  |  7 ++++
 net/sctp/associola.c       |  1 +
 net/sctp/socket.c          | 84 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index ee037ef15d65..d99b76e33b2e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1257,6 +1257,8 @@ struct sctp_endpoint {
 	__u8  auth_enable:1,
 	      prsctp_enable:1,
 	      reconf_enable:1;
+
+	__u8  strreset_enable;
 };
 
 /* Recover the outter endpoint structure. */
@@ -1872,6 +1874,8 @@ struct sctp_association {
 	     prsctp_enable:1,
 	     reconf_enable:1;
 
+	__u8 strreset_enable;
+
 	__u32 strreset_outseq; /* Update after receiving response */
 	__u32 strreset_inseq; /* Update after receiving request */
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index a406adcc0793..867be0f32fd7 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -115,6 +115,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PR_SUPPORTED	113
 #define SCTP_DEFAULT_PRINFO	114
 #define SCTP_PR_ASSOC_STATUS	115
+#define SCTP_ENABLE_STREAM_RESET	118
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -138,6 +139,12 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PR_RTX_ENABLED(x)	(SCTP_PR_POLICY(x) == SCTP_PR_SCTP_RTX)
 #define SCTP_PR_PRIO_ENABLED(x)	(SCTP_PR_POLICY(x) == SCTP_PR_SCTP_PRIO)
 
+/* For enable stream reset */
+#define SCTP_ENABLE_RESET_STREAM_REQ	0x01
+#define SCTP_ENABLE_RESET_ASSOC_REQ	0x02
+#define SCTP_ENABLE_CHANGE_ASSOC_REQ	0x04
+#define SCTP_ENABLE_STRRESET_MASK	0x07
+
 /* These are bit fields for msghdr->msg_flags.  See section 5.1.  */
 /* On user space Linux, these live in <bits/socket.h> as an enum.  */
 enum sctp_msg_flags {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 68b99adc21a3..e50dc6d7543f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -271,6 +271,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->active_key_id = ep->active_key_id;
 	asoc->prsctp_enable = ep->prsctp_enable;
 	asoc->reconf_enable = ep->reconf_enable;
+	asoc->strreset_enable = ep->strreset_enable;
 
 	/* Save the hmacs and chunks list into this association */
 	if (ep->auth_hmacs_list)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 635e03412693..0a9bc984b6c8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3750,6 +3750,42 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_enable_strreset(struct sock *sk,
+					   char __user *optval,
+					   unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		asoc->strreset_enable = params.assoc_value;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		sp->ep->strreset_enable = params.assoc_value;
+	} else {
+		goto out;
+	}
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3916,6 +3952,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_DEFAULT_PRINFO:
 		retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
 		break;
+	case SCTP_ENABLE_STREAM_RESET:
+		retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6400,6 +6439,47 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
+					   char __user *optval,
+					   int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->strreset_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->ep->strreset_enable;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6567,6 +6647,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
 							optlen);
 		break;
+	case SCTP_ENABLE_STREAM_RESET:
+		retval = sctp_getsockopt_enable_strreset(sk, len, optval,
+							 optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 7f9d68ac944e24ee5f9ac8d059ca00b1c1d34137 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 18 Jan 2017 00:44:47 +0800
Subject: sctp: implement sender-side procedures for SSN Reset Request
 Parameter

This patch is to implement sender-side procedures for the Outgoing
and Incoming SSN Reset Request Parameter described in rfc6525 section
5.1.2 and 5.1.3.

It is also add sockopt SCTP_RESET_STREAMS in rfc6525 section 6.3.2
for users.

Note that the new asoc member strreset_outstanding is to make sure
only one reconf request chunk on the fly as rfc6525 section 5.1.1
demands.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    |  6 ++++
 include/net/sctp/structs.h |  1 +
 include/uapi/linux/sctp.h  | 11 +++++++
 net/sctp/outqueue.c        | 33 +++++++++++++------
 net/sctp/socket.c          | 29 +++++++++++++++++
 net/sctp/stream.c          | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 149 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index bc0e049b1474..3cfd365bcfbc 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -193,6 +193,12 @@ void sctp_remaddr_proc_exit(struct net *net);
  */
 int sctp_offload_init(void);
 
+/*
+ * sctp/stream.c
+ */
+int sctp_send_reset_streams(struct sctp_association *asoc,
+			    struct sctp_reset_streams *params);
+
 /*
  * Module global variables
  */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d99b76e33b2e..231fa9ac50bd 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1875,6 +1875,7 @@ struct sctp_association {
 	     reconf_enable:1;
 
 	__u8 strreset_enable;
+	__u8 strreset_outstanding; /* request param count on the fly */
 
 	__u32 strreset_outseq; /* Update after receiving response */
 	__u32 strreset_inseq; /* Update after receiving request */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 867be0f32fd7..03c27cefffb1 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -116,6 +116,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_DEFAULT_PRINFO	114
 #define SCTP_PR_ASSOC_STATUS	115
 #define SCTP_ENABLE_STREAM_RESET	118
+#define SCTP_RESET_STREAMS	119
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -145,6 +146,9 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ENABLE_CHANGE_ASSOC_REQ	0x04
 #define SCTP_ENABLE_STRRESET_MASK	0x07
 
+#define SCTP_STREAM_RESET_INCOMING	0x01
+#define SCTP_STREAM_RESET_OUTGOING	0x02
+
 /* These are bit fields for msghdr->msg_flags.  See section 5.1.  */
 /* On user space Linux, these live in <bits/socket.h> as an enum.  */
 enum sctp_msg_flags {
@@ -1015,4 +1019,11 @@ struct sctp_info {
 	__u32	__reserved3;
 };
 
+struct sctp_reset_streams {
+	sctp_assoc_t srs_assoc_id;
+	uint16_t srs_flags;
+	uint16_t srs_number_streams;	/* 0 == ALL */
+	uint16_t srs_stream_list[];	/* list if srs_num_streams is not 0 */
+};
+
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 34efaa4ef2f6..65abe22d8691 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -915,22 +915,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_ECN_ECNE:
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
+		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
 			if (status  != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
-			} else {
-				asoc->stats.octrlchunks++;
-				/* PR-SCTP C5) If a FORWARD TSN is sent, the
-				 * sender MUST assure that at least one T3-rtx
-				 * timer is running.
-				 */
-				if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
-					sctp_transport_reset_t3_rtx(transport);
-					transport->last_time_sent = jiffies;
-				}
+				break;
+			}
+
+			asoc->stats.octrlchunks++;
+			/* PR-SCTP C5) If a FORWARD TSN is sent, the
+			 * sender MUST assure that at least one T3-rtx
+			 * timer is running.
+			 */
+			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+				sctp_transport_reset_t3_rtx(transport);
+				transport->last_time_sent = jiffies;
 			}
+
+			if (chunk == asoc->strreset_chunk)
+				sctp_transport_reset_reconf_timer(transport);
+
 			break;
 
 		default:
@@ -1016,6 +1022,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 		/* Finally, transmit new packets.  */
 		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
+
 			/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
 			 * stream identifier.
 			 */
@@ -1038,6 +1046,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				continue;
 			}
 
+			if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) {
+				sctp_outq_head_data(q, chunk);
+				goto sctp_flush_out;
+			}
+
 			/* If there is a specified transport, use it.
 			 * Otherwise, we want to use the active path.
 			 */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0a9bc984b6c8..bee4dd3feabb 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3786,6 +3786,32 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_reset_streams(struct sock *sk,
+					 char __user *optval,
+					 unsigned int optlen)
+{
+	struct sctp_reset_streams *params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(struct sctp_reset_streams))
+		return -EINVAL;
+
+	params = memdup_user(optval, optlen);
+	if (IS_ERR(params))
+		return PTR_ERR(params);
+
+	asoc = sctp_id2assoc(sk, params->srs_assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_reset_streams(asoc, params);
+
+out:
+	kfree(params);
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3955,6 +3981,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_ENABLE_STREAM_RESET:
 		retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
 		break;
+	case SCTP_RESET_STREAMS:
+		retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f86de43cbbe5..13d5e07dcd7d 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -33,6 +33,7 @@
  */
 
 #include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
 
 struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp)
 {
@@ -83,3 +84,81 @@ void sctp_stream_clear(struct sctp_stream *stream)
 	for (i = 0; i < stream->incnt; i++)
 		stream->in[i].ssn = 0;
 }
+
+static int sctp_send_reconf(struct sctp_association *asoc,
+			    struct sctp_chunk *chunk)
+{
+	struct net *net = sock_net(asoc->base.sk);
+	int retval = 0;
+
+	retval = sctp_primitive_RECONF(net, asoc, chunk);
+	if (retval)
+		sctp_chunk_free(chunk);
+
+	return retval;
+}
+
+int sctp_send_reset_streams(struct sctp_association *asoc,
+			    struct sctp_reset_streams *params)
+{
+	struct sctp_stream *stream = asoc->stream;
+	__u16 i, str_nums, *str_list;
+	struct sctp_chunk *chunk;
+	int retval = -EINVAL;
+	bool out, in;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
+		retval = -ENOPROTOOPT;
+		goto out;
+	}
+
+	if (asoc->strreset_outstanding) {
+		retval = -EINPROGRESS;
+		goto out;
+	}
+
+	out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
+	in  = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
+	if (!out && !in)
+		goto out;
+
+	str_nums = params->srs_number_streams;
+	str_list = params->srs_stream_list;
+	if (out && str_nums)
+		for (i = 0; i < str_nums; i++)
+			if (str_list[i] >= stream->outcnt)
+				goto out;
+
+	if (in && str_nums)
+		for (i = 0; i < str_nums; i++)
+			if (str_list[i] >= stream->incnt)
+				goto out;
+
+	chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
+	if (!chunk)
+		goto out;
+
+	if (out) {
+		if (str_nums)
+			for (i = 0; i < str_nums; i++)
+				stream->out[str_list[i]].state =
+						       SCTP_STREAM_CLOSED;
+		else
+			for (i = 0; i < stream->outcnt; i++)
+				stream->out[i].state = SCTP_STREAM_CLOSED;
+	}
+
+	asoc->strreset_outstanding = out + in;
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+	}
+
+out:
+	return retval;
+}
-- 
cgit v1.2.3


From 1a28ad74ebd8f9d3c7eae0d781f72a6d30545e17 Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Mon, 16 Jan 2017 22:02:57 +0800
Subject: netfilter: nf_tables: eliminate useless condition checks

The return value of nf_tables_table_lookup() is valid pointer or one
pointer error. There are two cases:

1) IS_ERR(table) is true, it would return the error or reset the
   table as NULL, it is unnecessary to perform the latter check
   "table != NULL".

2) IS_ERR(obj) is false, the table is one valid pointer. It is also
   unnecessary to perform that check.

The nf_tables_newset() and nf_tables_newobj() have same logic codes.

In summary, we could move the block of condition check "table != NULL"
in the else block to eliminate the original condition checks.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a019a87e58ee..6e07c214c208 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -696,10 +696,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
-		table = NULL;
-	}
-
-	if (table != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -2963,10 +2960,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (IS_ERR(set)) {
 		if (PTR_ERR(set) != -ENOENT)
 			return PTR_ERR(set);
-		set = NULL;
-	}
-
-	if (set != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -4153,10 +4147,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		if (err != -ENOENT)
 			return err;
 
-		obj = NULL;
-	}
-
-	if (obj != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 
-- 
cgit v1.2.3


From fd61c6ba313de6758aeeab58fe03bd9fbbc8cea9 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Jan 2017 15:51:07 -0800
Subject: net: ipv6: remove nowait arg to rt6_fill_node

All callers of rt6_fill_node pass 0 for nowait arg. Remove the arg and
simplify rt6_fill_node accordingly.

rt6_fill_node passes the nowait of 0 to ip6mr_get_route. Remove the
nowait arg from it as well.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute6.h |  2 +-
 net/ipv6/ip6mr.c        |  9 ++-------
 net/ipv6/route.c        | 27 ++++++++++-----------------
 3 files changed, 13 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 19a1c0c2993b..ce44e3e96d27 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -116,7 +116,7 @@ struct mfc6_cache {
 
 struct rtmsg;
 extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
-			   struct rtmsg *rtm, int nowait, u32 portid);
+			   struct rtmsg *rtm, u32 portid);
 
 #ifdef CONFIG_IPV6_MROUTE
 extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e275077e8af2..babaf3ec2742 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2288,7 +2288,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 }
 
 int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
-		    int nowait, u32 portid)
+		    u32 portid)
 {
 	int err;
 	struct mr6_table *mrt;
@@ -2315,11 +2315,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		struct net_device *dev;
 		int vif;
 
-		if (nowait) {
-			read_unlock(&mrt_lock);
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
 			read_unlock(&mrt_lock);
@@ -2357,7 +2352,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		return err;
 	}
 
-	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		cache->mfc_flags |= MFC_NOTIFY;
 
 	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4f6b067c8753..b2044dd71724 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3169,7 +3169,7 @@ static int rt6_fill_node(struct net *net,
 			 struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *dst, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
-			 int prefix, int nowait, unsigned int flags)
+			 int prefix, unsigned int flags)
 {
 	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
@@ -3261,19 +3261,12 @@ static int rt6_fill_node(struct net *net,
 	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
-			int err = ip6mr_get_route(net, skb, rtm, nowait,
-						  portid);
-
-			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-				}
-			}
+			int err = ip6mr_get_route(net, skb, rtm, portid);
+
+			if (err == 0)
+				return 0;
+			if (err < 0)
+				goto nla_put_failure;
 		} else
 #endif
 			if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3342,7 +3335,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 	return rt6_fill_node(arg->net,
 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
-		     prefix, 0, NLM_F_MULTI);
+		     prefix, NLM_F_MULTI);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3433,7 +3426,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 
 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
-			    nlh->nlmsg_seq, 0, 0, 0);
+			    nlh->nlmsg_seq, 0, 0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
@@ -3460,7 +3453,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
 		goto errout;
 
 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, 0, 0, nlm_flags);
+				event, info->portid, seq, 0, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
cgit v1.2.3


From f8cfe2ceb1c2ebe1469040d037a733815d2255d5 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Jan 2017 15:51:08 -0800
Subject: net: ipv6: remove prefix arg to rt6_fill_node

The prefix arg to rt6_fill_node is non-0 in only 1 path - rt6_dump_route
where a user is requesting a prefix only dump. Simplify rt6_fill_node
by removing the prefix arg and moving the prefix check to rt6_dump_route.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b2044dd71724..5585c501a540 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3169,7 +3169,7 @@ static int rt6_fill_node(struct net *net,
 			 struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *dst, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
-			 int prefix, unsigned int flags)
+			 unsigned int flags)
 {
 	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
@@ -3177,13 +3177,6 @@ static int rt6_fill_node(struct net *net,
 	long expires;
 	u32 table;
 
-	if (prefix) {	/* user wants prefix routes only */
-		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
-			return 1;
-		}
-	}
-
 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -3324,18 +3317,22 @@ nla_put_failure:
 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
-	int prefix;
 
 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
-	} else
-		prefix = 0;
+
+		/* user wants prefix routes only */
+		if (rtm->rtm_flags & RTM_F_PREFIX &&
+		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+			/* success since this is not a prefix route */
+			return 1;
+		}
+	}
 
 	return rt6_fill_node(arg->net,
 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
-		     prefix, NLM_F_MULTI);
+		     NLM_F_MULTI);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3426,7 +3423,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 
 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
-			    nlh->nlmsg_seq, 0, 0);
+			    nlh->nlmsg_seq, 0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
@@ -3453,7 +3450,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
 		goto errout;
 
 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, 0, nlm_flags);
+				event, info->portid, seq, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
cgit v1.2.3


From b22de490869da354116ea4cbbaa09dcbc260b2b4 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Tue, 17 Jan 2017 20:41:38 -0500
Subject: net: dsa: store CPU switch structure in the tree

Store a dsa_switch pointer to the CPU switch in the tree instead of only
its index. This avoids the need to initialize it to -1.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 8 ++++----
 net/dsa/dsa.c     | 7 +++----
 net/dsa/dsa2.c    | 5 ++---
 3 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b94d1f2ef912..c72ed7af2a2a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -124,7 +124,7 @@ struct dsa_switch_tree {
 	/*
 	 * The switch and port to which the CPU is attached.
 	 */
-	s8			cpu_switch;
+	struct dsa_switch	*cpu_switch;
 	s8			cpu_port;
 
 	/*
@@ -204,7 +204,7 @@ struct dsa_switch {
 
 static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
 {
-	return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port);
+	return !!(ds == ds->dst->cpu_switch && p == ds->dst->cpu_port);
 }
 
 static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p)
@@ -227,10 +227,10 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds)
 	 * Else return the (DSA) port number that connects to the
 	 * switch that is one hop closer to the cpu.
 	 */
-	if (dst->cpu_switch == ds->index)
+	if (dst->cpu_switch == ds)
 		return dst->cpu_port;
 	else
-		return ds->rtable[dst->cpu_switch];
+		return ds->rtable[dst->cpu_switch->index];
 }
 
 struct switchdev_trans;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index fd532487dfdf..b220609cfe6f 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -225,12 +225,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			continue;
 
 		if (!strcmp(name, "cpu")) {
-			if (dst->cpu_switch != -1) {
+			if (!dst->cpu_switch) {
 				netdev_err(dst->master_netdev,
 					   "multiple cpu ports?!\n");
 				return -EINVAL;
 			}
-			dst->cpu_switch = index;
+			dst->cpu_switch = ds;
 			dst->cpu_port = i;
 			ds->cpu_port_mask |= 1 << i;
 		} else if (!strcmp(name, "dsa")) {
@@ -254,7 +254,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 * tagging protocol to the preferred tagging format of this
 	 * switch.
 	 */
-	if (dst->cpu_switch == index) {
+	if (dst->cpu_switch == ds) {
 		enum dsa_tag_protocol tag_protocol;
 
 		tag_protocol = ops->get_tag_protocol(ds);
@@ -757,7 +757,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
 
 	dst->pd = pd;
 	dst->master_netdev = dev;
-	dst->cpu_switch = -1;
 	dst->cpu_port = -1;
 
 	for (i = 0; i < pd->nr_chips; i++) {
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 42a41d84053c..020e072b4299 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree)
 	if (!dst)
 		return NULL;
 	dst->tree = tree;
-	dst->cpu_switch = -1;
 	INIT_LIST_HEAD(&dst->list);
 	list_add_tail(&dsa_switch_trees, &dst->list);
 	kref_init(&dst->refcount);
@@ -448,8 +447,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 	if (!dst->master_netdev)
 		dst->master_netdev = ethernet_dev;
 
-	if (dst->cpu_switch == -1) {
-		dst->cpu_switch = ds->index;
+	if (!dst->cpu_switch) {
+		dst->cpu_switch = ds;
 		dst->cpu_port = index;
 	}
 
-- 
cgit v1.2.3


From 9520ed8fb8410dcb6babf751561a08f73ca03812 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Tue, 17 Jan 2017 20:41:39 -0500
Subject: net: dsa: use cpu_switch instead of ds[0]

Now that the DSA Ethernet switches are true Linux devices, the CPU
switch is not necessarily the first one. If its address is higher than
the second switch on the same MDIO bus, its index will be 1, not 0.

Avoid any confusion by using dst->cpu_switch instead of dst->ds[0].

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c         | 2 +-
 net/dsa/dsa2.c        | 8 ++++----
 net/dsa/slave.c       | 6 +++---
 net/dsa/tag_brcm.c    | 2 +-
 net/dsa/tag_qca.c     | 2 +-
 net/dsa/tag_trailer.c | 2 +-
 6 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index b220609cfe6f..91f96e1bd2ec 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -868,7 +868,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 			dsa_switch_destroy(ds);
 	}
 
-	dsa_cpu_port_ethtool_restore(dst->ds[0]);
+	dsa_cpu_port_ethtool_restore(dst->cpu_switch);
 
 	dev_put(dst->master_netdev);
 }
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 020e072b4299..866222a8f9bf 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -377,8 +377,8 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
 			return err;
 	}
 
-	if (dst->ds[0]) {
-		err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
+	if (dst->cpu_switch) {
+		err = dsa_cpu_port_ethtool_setup(dst->cpu_switch);
 		if (err)
 			return err;
 	}
@@ -418,8 +418,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 		dsa_ds_unapply(dst, ds);
 	}
 
-	if (dst->ds[0])
-		dsa_cpu_port_ethtool_restore(dst->ds[0]);
+	if (dst->cpu_switch)
+		dsa_cpu_port_ethtool_restore(dst->cpu_switch);
 
 	pr_info("DSA: tree %d unapplied\n", dst->tree);
 	dst->applied = false;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 0cdcaf526987..b8e58689a9a1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -781,7 +781,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
 					   uint64_t *data)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	s8 cpu_port = dst->cpu_port;
 	int count = 0;
 
@@ -798,7 +798,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
 static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	int count = 0;
 
 	if (dst->master_ethtool_ops.get_sset_count)
@@ -814,7 +814,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
 				     uint32_t stringset, uint8_t *data)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	s8 cpu_port = dst->cpu_port;
 	int len = ETH_GSTRING_LEN;
 	int mcount = 0, count;
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 21bffde6e4bf..af82927674e0 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (unlikely(dst == NULL))
 		goto out_drop;
 
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 
 	skb = skb_unshare(skb, GFP_ATOMIC);
 	if (skb == NULL)
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 0c90cacee7aa..736ca8e8c31e 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* This protocol doesn't support cascading multiple switches so it's
 	 * safe to assume the switch is first in the tree
 	 */
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 	if (!ds)
 		goto out_drop;
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5e3903eb1afa..271128a2dc64 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (unlikely(dst == NULL))
 		goto out_drop;
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 
 	skb = skb_unshare(skb, GFP_ATOMIC);
 	if (skb == NULL)
-- 
cgit v1.2.3


From 4a7c972644c1151f6dd34ff4b5f7eacb239e22ee Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Wed, 18 Jan 2017 17:45:01 +0100
Subject: net: Remove usage of net_device last_rx member

The network stack no longer uses the last_rx member of struct net_device
since the bonding driver switched to use its own private last_rx in
commit 9f242738376d ("bonding: use last_arp_rx in slave_last_rx()").

However, some drivers still (ab)use the field for their own purposes and
some driver just update it without actually using it.

Previously, there was an accompanying comment for the last_rx member
added in commit 4dc89133f49b ("net: add a comment on netdev->last_rx")
which asked drivers not to update is, unless really needed. However,
this commend was removed in commit f8ff080dacec ("bonding: remove
useless updating of slave->dev->last_rx"), so some drivers added later
on still did update last_rx.

Remove all usage of last_rx and switch three drivers (sky2, atp and
smc91c92_cs) which actually read and write it to use their own private
copy in netdev_priv.

Compile-tested with allyesconfig and allmodconfig on x86 and arm.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Mirko Lindner <mlindner@marvell.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Acked-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/m68k/emu/nfeth.c                              | 1 -
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 1 -
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 1 -
 drivers/net/ethernet/hisilicon/hns/hns_enet.c      | 1 -
 drivers/net/ethernet/intel/e1000e/netdev.c         | 6 +++---
 drivers/net/ethernet/intel/igb/igb_main.c          | 6 +++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      | 7 +++----
 drivers/net/ethernet/marvell/sky2.c                | 6 +++---
 drivers/net/ethernet/marvell/sky2.h                | 1 +
 drivers/net/ethernet/qualcomm/emac/emac-mac.c      | 1 -
 drivers/net/ethernet/realtek/atp.c                 | 7 +++----
 drivers/net/ethernet/smsc/smc91c92_cs.c            | 6 ++++--
 drivers/net/irda/bfin_sir.c                        | 5 ++---
 drivers/net/irda/sh_sir.c                          | 1 -
 drivers/staging/ks7010/ks_hostif.c                 | 2 --
 drivers/staging/netlogic/xlr_net.c                 | 1 -
 drivers/staging/rtl8192e/rtllib_rx.c               | 1 -
 drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c  | 4 ----
 drivers/staging/wlan-ng/hfa384x_usb.c              | 1 -
 drivers/staging/wlan-ng/p80211netdev.c             | 2 --
 include/linux/netdevice.h                          | 3 ---
 net/batman-adv/bridge_loop_avoidance.c             | 1 -
 net/batman-adv/distributed-arp-table.c             | 1 -
 net/batman-adv/soft-interface.c                    | 2 --
 24 files changed, 22 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c
index fc4be028c418..e45ce4243aaa 100644
--- a/arch/m68k/emu/nfeth.c
+++ b/arch/m68k/emu/nfeth.c
@@ -124,7 +124,6 @@ static inline void recv_packet(struct net_device *dev)
 
 	skb->protocol = eth_type_trans(skb, dev);
 	netif_rx(skb);
-	dev->last_rx = jiffies;
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += pktlen;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 2b89ec291b8b..5ee3f007c613 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2360,7 +2360,6 @@ liquidio_push_packet(u32 octeon_id __attribute__((unused)),
 		if (packet_was_received) {
 			droq->stats.rx_bytes_received += len;
 			droq->stats.rx_pkts_received++;
-			netdev->last_rx = jiffies;
 		} else {
 			droq->stats.rx_dropped++;
 			netif_info(lio, rx_err, lio->netdev,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 19d88fb387ce..e96cf6cdecfd 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1571,7 +1571,6 @@ liquidio_push_packet(u32 octeon_id __attribute__((unused)),
 		if (packet_was_received) {
 			droq->stats.rx_bytes_received += len;
 			droq->stats.rx_pkts_received++;
-			netdev->last_rx = jiffies;
 		} else {
 			droq->stats.rx_dropped++;
 			netif_info(lio, rx_err, lio->netdev,
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index b7cb61385ad8..f7b75e96c1c3 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -797,7 +797,6 @@ static void hns_nic_rx_up_pro(struct hns_nic_ring_data *ring_data,
 
 	skb->protocol = eth_type_trans(skb, ndev);
 	(void)napi_gro_receive(&ring_data->napi, skb);
-	ndev->last_rx = jiffies;
 }
 
 static int hns_desc_unused(struct hnae_ring *ring)
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 79651eb608ff..2175cced402f 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -240,9 +240,9 @@ static void e1000e_dump(struct e1000_adapter *adapter)
 	/* Print netdevice Info */
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
-		pr_info("Device Name     state            trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n", netdev->name,
-			netdev->state, dev_trans_start(netdev), netdev->last_rx);
+		pr_info("Device Name     state            trans_start\n");
+		pr_info("%-15s %016lX %016lX\n", netdev->name,
+			netdev->state, dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7fc95493b692..be456bae8169 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -383,9 +383,9 @@ static void igb_dump(struct igb_adapter *adapter)
 	/* Print netdevice Info */
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
-		pr_info("Device Name     state            trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n", netdev->name,
-			netdev->state, dev_trans_start(netdev), netdev->last_rx);
+		pr_info("Device Name     state            trans_start\n");
+		pr_info("%-15s %016lX %016lX\n", netdev->name,
+			netdev->state, dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ffe7d940d9ff..3b3b52b62a5f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -611,12 +611,11 @@ static void ixgbe_dump(struct ixgbe_adapter *adapter)
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
 		pr_info("Device Name     state            "
-			"trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n",
+			"trans_start\n");
+		pr_info("%-15s %016lX %016lX\n",
 			netdev->name,
 			netdev->state,
-			dev_trans_start(netdev),
-			netdev->last_rx);
+			dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index be003c5a4f5f..2b2cc3f3ca10 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2666,7 +2666,7 @@ static inline void sky2_rx_done(struct sky2_hw *hw, unsigned port,
 	sky2->rx_stats.bytes += bytes;
 	u64_stats_update_end(&sky2->rx_stats.syncp);
 
-	dev->last_rx = jiffies;
+	sky2->last_rx = jiffies;
 	sky2_rx_update(netdev_priv(dev), rxqaddr[port]);
 }
 
@@ -2953,7 +2953,7 @@ static int sky2_rx_hung(struct net_device *dev)
 	u8 fifo_lev = sky2_read8(hw, Q_ADDR(rxq, Q_RL));
 
 	/* If idle and MAC or PCI is stuck */
-	if (sky2->check.last == dev->last_rx &&
+	if (sky2->check.last == sky2->last_rx &&
 	    ((mac_rp == sky2->check.mac_rp &&
 	      mac_lev != 0 && mac_lev >= sky2->check.mac_lev) ||
 	     /* Check if the PCI RX hang */
@@ -2965,7 +2965,7 @@ static int sky2_rx_hung(struct net_device *dev)
 			      fifo_rp, sky2_read8(hw, Q_ADDR(rxq, Q_WP)));
 		return 1;
 	} else {
-		sky2->check.last = dev->last_rx;
+		sky2->check.last = sky2->last_rx;
 		sky2->check.mac_rp = mac_rp;
 		sky2->check.mac_lev = mac_lev;
 		sky2->check.fifo_rp = fifo_rp;
diff --git a/drivers/net/ethernet/marvell/sky2.h b/drivers/net/ethernet/marvell/sky2.h
index ec6dcd80152b..0fe160796842 100644
--- a/drivers/net/ethernet/marvell/sky2.h
+++ b/drivers/net/ethernet/marvell/sky2.h
@@ -2247,6 +2247,7 @@ struct sky2_port {
 	u16		     rx_data_size;
 	u16		     rx_nfrags;
 
+	unsigned long	     last_rx;
 	struct {
 		unsigned long last;
 		u32	mac_rp;
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index 0b4deb31e742..d297ed961da6 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -1213,7 +1213,6 @@ void emac_mac_rx_process(struct emac_adapter *adpt, struct emac_rx_queue *rx_q,
 		emac_receive_skb(rx_q, skb, (u16)RRD_CVALN_TAG(&rrd),
 				 (bool)RRD_CVTAG(&rrd));
 
-		netdev->last_rx = jiffies;
 		(*num_pkts)++;
 	} while (*num_pkts < max_pkts);
 
diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c
index 570ed3bd3cbf..9bcd4aefc9c5 100644
--- a/drivers/net/ethernet/realtek/atp.c
+++ b/drivers/net/ethernet/realtek/atp.c
@@ -170,7 +170,7 @@ struct net_local {
     spinlock_t lock;
     struct net_device *next_module;
     struct timer_list timer;	/* Media selection timer. */
-    long last_rx_time;		/* Last Rx, in jiffies, to handle Rx hang. */
+    unsigned long last_rx_time;	/* Last Rx, in jiffies, to handle Rx hang. */
     int saved_tx_size;
     unsigned int tx_unit_busy:1;
     unsigned char re_tx,	/* Number of packet retransmissions. */
@@ -668,11 +668,11 @@ static irqreturn_t atp_interrupt(int irq, void *dev_instance)
 			}
 			num_tx_since_rx++;
 		} else if (num_tx_since_rx > 8 &&
-			   time_after(jiffies, dev->last_rx + HZ)) {
+			   time_after(jiffies, lp->last_rx_time + HZ)) {
 			if (net_debug > 2)
 				printk(KERN_DEBUG "%s: Missed packet? No Rx after %d Tx and "
 					   "%ld jiffies status %02x  CMR1 %02x.\n", dev->name,
-					   num_tx_since_rx, jiffies - dev->last_rx, status,
+					   num_tx_since_rx, jiffies - lp->last_rx_time, status,
 					   (read_nibble(ioaddr, CMR1) >> 3) & 15);
 			dev->stats.rx_missed_errors++;
 			hardware_init(dev);
@@ -789,7 +789,6 @@ static void net_rx(struct net_device *dev)
 		read_block(ioaddr, pkt_len, skb_put(skb,pkt_len), dev->if_port);
 		skb->protocol = eth_type_trans(skb, dev);
 		netif_rx(skb);
-		dev->last_rx = jiffies;
 		dev->stats.rx_packets++;
 		dev->stats.rx_bytes += pkt_len;
 	}
diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c
index 67154621abcf..97280daba27f 100644
--- a/drivers/net/ethernet/smsc/smc91c92_cs.c
+++ b/drivers/net/ethernet/smsc/smc91c92_cs.c
@@ -113,6 +113,7 @@ struct smc_private {
     struct mii_if_info		mii_if;
     int				duplex;
     int				rx_ovrn;
+    unsigned long		last_rx;
 };
 
 /* Special definitions for Megahertz multifunction cards */
@@ -1491,6 +1492,7 @@ static void smc_rx(struct net_device *dev)
     if (!(rx_status & RS_ERRORS)) {
 	/* do stuff to make a new packet */
 	struct sk_buff *skb;
+	struct smc_private *smc = netdev_priv(dev);
 	
 	/* Note: packet_length adds 5 or 6 extra bytes here! */
 	skb = netdev_alloc_skb(dev, packet_length+2);
@@ -1509,7 +1511,7 @@ static void smc_rx(struct net_device *dev)
 	skb->protocol = eth_type_trans(skb, dev);
 	
 	netif_rx(skb);
-	dev->last_rx = jiffies;
+	smc->last_rx = jiffies;
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += packet_length;
 	if (rx_status & RS_MULTICAST)
@@ -1790,7 +1792,7 @@ static void media_check(u_long arg)
     }
 
     /* Ignore collisions unless we've had no rx's recently */
-    if (time_after(jiffies, dev->last_rx + HZ)) {
+    if (time_after(jiffies, smc->last_rx + HZ)) {
 	if (smc->tx_err || (smc->media_status & EPH_16COL))
 	    media |= EPH_16COL;
     }
diff --git a/drivers/net/irda/bfin_sir.c b/drivers/net/irda/bfin_sir.c
index be5bb0b7f29c..3151b580dbd6 100644
--- a/drivers/net/irda/bfin_sir.c
+++ b/drivers/net/irda/bfin_sir.c
@@ -22,7 +22,7 @@ static int max_rate = 57600;
 static int max_rate = 115200;
 #endif
 
-static void turnaround_delay(unsigned long last_jif, int mtt)
+static void turnaround_delay(int mtt)
 {
 	long ticks;
 
@@ -209,7 +209,6 @@ static void bfin_sir_rx_chars(struct net_device *dev)
 	UART_CLEAR_LSR(port);
 	ch = UART_GET_CHAR(port);
 	async_unwrap_char(dev, &self->stats, &self->rx_buff, ch);
-	dev->last_rx = jiffies;
 }
 
 static irqreturn_t bfin_sir_rx_int(int irq, void *dev_id)
@@ -510,7 +509,7 @@ static void bfin_sir_send_work(struct work_struct *work)
 	int tx_cnt = 10;
 
 	while (bfin_sir_is_receiving(dev) && --tx_cnt)
-		turnaround_delay(dev->last_rx, self->mtt);
+		turnaround_delay(self->mtt);
 
 	bfin_sir_stop_rx(port);
 
diff --git a/drivers/net/irda/sh_sir.c b/drivers/net/irda/sh_sir.c
index e3fe9a286136..fede6864c737 100644
--- a/drivers/net/irda/sh_sir.c
+++ b/drivers/net/irda/sh_sir.c
@@ -547,7 +547,6 @@ static void sh_sir_rx(struct sh_sir_self *self)
 
 		async_unwrap_char(self->ndev, &self->ndev->stats,
 				  &self->rx_buff, (u8)data);
-		self->ndev->last_rx = jiffies;
 
 		if (EOFD & sh_sir_read(self, IRIF_SIR_FRM))
 			continue;
diff --git a/drivers/staging/ks7010/ks_hostif.c b/drivers/staging/ks7010/ks_hostif.c
index 1fbd495e5e63..c7652c35be19 100644
--- a/drivers/staging/ks7010/ks_hostif.c
+++ b/drivers/staging/ks7010/ks_hostif.c
@@ -461,7 +461,6 @@ void hostif_data_indication(struct ks_wlan_private *priv)
 			skb->protocol = eth_type_trans(skb, skb->dev);
 			priv->nstats.rx_packets++;
 			priv->nstats.rx_bytes += rx_ind_size;
-			skb->dev->last_rx = jiffies;
 			netif_rx(skb);
 		} else {
 			priv->nstats.rx_dropped++;
@@ -494,7 +493,6 @@ void hostif_data_indication(struct ks_wlan_private *priv)
 			skb->protocol = eth_type_trans(skb, skb->dev);
 			priv->nstats.rx_packets++;
 			priv->nstats.rx_bytes += rx_ind_size;
-			skb->dev->last_rx = jiffies;
 			netif_rx(skb);
 		} else {
 			priv->nstats.rx_dropped++;
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index f84069ffa8c6..781ef623233e 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -155,7 +155,6 @@ static void xlr_net_fmn_handler(int bkt, int src_stnid, int size, int code,
 		skb_reserve(skb, BYTE_OFFSET);
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, skb->dev);
-		skb->dev->last_rx = jiffies;
 		netif_rx(skb);
 		/* Fill rx ring */
 		skb_data = xlr_alloc_skb();
diff --git a/drivers/staging/rtl8192e/rtllib_rx.c b/drivers/staging/rtl8192e/rtllib_rx.c
index e5ba7d1a809f..43a77745e6fb 100644
--- a/drivers/staging/rtl8192e/rtllib_rx.c
+++ b/drivers/staging/rtl8192e/rtllib_rx.c
@@ -1375,7 +1375,6 @@ static int rtllib_rx_InfraAdhoc(struct rtllib_device *ieee, struct sk_buff *skb,
 		ieee->LinkDetectInfo.NumRecvDataInPeriod++;
 		ieee->LinkDetectInfo.NumRxOkInPeriod++;
 	}
-	dev->last_rx = jiffies;
 
 	/* Data frame - extract src/dst addresses */
 	rtllib_rx_extract_addr(ieee, hdr, dst, src, bssid);
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
index 82f654305414..b1f2fdfcb718 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
@@ -1103,11 +1103,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 		stats = hostap_get_stats(dev);
 		from_assoc_ap = 1;
 	}
-#endif
-
-	dev->last_rx = jiffies;
 
-#ifdef NOT_YET
 	if ((ieee->iw_mode == IW_MODE_MASTER ||
 	     ieee->iw_mode == IW_MODE_REPEAT) &&
 	    !from_assoc_ap) {
diff --git a/drivers/staging/wlan-ng/hfa384x_usb.c b/drivers/staging/wlan-ng/hfa384x_usb.c
index 4fe037aeef12..6134eba5cad4 100644
--- a/drivers/staging/wlan-ng/hfa384x_usb.c
+++ b/drivers/staging/wlan-ng/hfa384x_usb.c
@@ -3409,7 +3409,6 @@ static void hfa384x_usbin_rx(struct wlandevice *wlandev, struct sk_buff *skb)
 			&usbin->rxfrm.desc.frame_control, hdrlen);
 
 		skb->dev = wlandev->netdev;
-		skb->dev->last_rx = jiffies;
 
 		/* And set the frame length properly */
 		skb_trim(skb, data_len + hdrlen);
diff --git a/drivers/staging/wlan-ng/p80211netdev.c b/drivers/staging/wlan-ng/p80211netdev.c
index 73fcf07254fe..53dbbd69e552 100644
--- a/drivers/staging/wlan-ng/p80211netdev.c
+++ b/drivers/staging/wlan-ng/p80211netdev.c
@@ -252,7 +252,6 @@ static int p80211_convert_to_ether(struct wlandevice *wlandev,
 	}
 
 	if (skb_p80211_to_ether(wlandev, wlandev->ethconv, skb) == 0) {
-		skb->dev->last_rx = jiffies;
 		wlandev->netdev->stats.rx_packets++;
 		wlandev->netdev->stats.rx_bytes += skb->len;
 		netif_rx_ni(skb);
@@ -287,7 +286,6 @@ static void p80211netdev_rx_bh(unsigned long arg)
 				skb->ip_summed = CHECKSUM_NONE;
 				skb->pkt_type = PACKET_OTHERHOST;
 				skb->protocol = htons(ETH_P_80211_RAW);
-				dev->last_rx = jiffies;
 
 				dev->stats.rx_packets++;
 				dev->stats.rx_bytes += skb->len;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97ae0ac513ee..3868c32d98af 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1551,7 +1551,6 @@ enum netdev_priv_flags {
  *	@ax25_ptr:	AX.25 specific data
  *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
  *
- *	@last_rx:	Time of last Rx
  *	@dev_addr:	Hw address (before bcast,
  *			because most packets are unicast)
  *
@@ -1777,8 +1776,6 @@ struct net_device {
 /*
  * Cache lines mostly used on receive path (including eth_type_trans())
  */
-	unsigned long		last_rx;
-
 	/* Interface address info used in eth_type_trans() */
 	unsigned char		*dev_addr;
 
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..36917a7b1b59 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	batadv_inc_counter(bat_priv, BATADV_CNT_RX);
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
-	soft_iface->last_rx = jiffies;
 
 	netif_rx(skb);
 out:
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 49576c5a3fe3..6394206bfcae 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1050,7 +1050,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 						   bat_priv->soft_iface);
 		bat_priv->stats.rx_packets++;
 		bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
-		bat_priv->soft_iface->last_rx = jiffies;
 
 		netif_rx(skb_new);
 		batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7b3494ae6ad9..420e19b501f2 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -481,8 +481,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
 
-	soft_iface->last_rx = jiffies;
-
 	/* Let the bridge loop avoidance check the packet. If will
 	 * not handle it, we can safely push it up.
 	 */
-- 
cgit v1.2.3


From 39b7b6a6247568f99df059e8a4c9bd674f6b99ac Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 19 Jan 2017 10:45:31 +0100
Subject: net/sched: cls_flower: reduce fl_change stack size

The new ARP support has pushed the stack size over the edge on ARM,
as there are two large objects on the stack in this function (mask
and tb) and both have now grown a bit more:

net/sched/cls_flower.c: In function 'fl_change':
net/sched/cls_flower.c:928:1: error: the frame size of 1072 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

We can solve this by dynamically allocating one or both of them.
I first tried to do it just for the mask, but that only saved
152 bytes on ARM, while this version just does it for the 'tb'
array, bringing the stack size back down to 664 bytes.

Fixes: 99d31326cbe6 ("net/sched: cls_flower: Support matching on ARP")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 27934456d984..9e74b0fa4b89 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -832,23 +832,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
 	struct cls_fl_filter *fnew;
-	struct nlattr *tb[TCA_FLOWER_MAX + 1];
+	struct nlattr **tb;
 	struct fl_flow_mask mask = {};
 	int err;
 
 	if (!tca[TCA_OPTIONS])
 		return -EINVAL;
 
+	tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+	if (!tb)
+		return -ENOBUFS;
+
 	err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
 	if (err < 0)
-		return err;
+		goto errout_tb;
 
-	if (fold && handle && fold->handle != handle)
-		return -EINVAL;
+	if (fold && handle && fold->handle != handle) {
+		err = -EINVAL;
+		goto errout_tb;
+	}
 
 	fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
-	if (!fnew)
-		return -ENOBUFS;
+	if (!fnew) {
+		err = -ENOBUFS;
+		goto errout_tb;
+	}
 
 	err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
 	if (err < 0)
@@ -919,11 +927,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		list_add_tail_rcu(&fnew->list, &head->filters);
 	}
 
+	kfree(tb);
 	return 0;
 
 errout:
 	tcf_exts_destroy(&fnew->exts);
 	kfree(fnew);
+errout_tb:
+	kfree(tb);
 	return err;
 }
 
-- 
cgit v1.2.3


From 54c30f604dcc110dba21db7c2c09ddf25b65acbe Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Thu, 19 Jan 2017 16:04:25 +0100
Subject: net: caif: Remove unused stats member from struct chnl_net

The stats member of struct chnl_net is used nowhere in the code, so it
might as well be removed.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/chnl_net.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 3408ed51b611..1816fc9f1ee7 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -44,7 +44,6 @@ enum caif_states {
 
 struct chnl_net {
 	struct cflayer chnl;
-	struct net_device_stats stats;
 	struct caif_connect_request conn_req;
 	struct list_head list_field;
 	struct net_device *netdev;
-- 
cgit v1.2.3


From a1a22c12060e4b9c52f45d4b3460f614e00162a2 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 18 Jan 2017 07:40:36 -0800
Subject: net: ipv6: Keep nexthop of multipath route on admin down

IPv6 deletes route entries associated with multipath routes on an
admin down where IPv4 does not. For example:
    $ ip ro ls vrf red
    unreachable default metric 8192
    1.1.1.0/24 metric 64
            nexthop via 10.100.1.254  dev eth1 weight 1
            nexthop via 10.100.2.254  dev eth2 weight 1
    10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.4
    10.100.2.0/24 dev eth2 proto kernel scope link src 10.100.2.4

    $ ip -6 ro ls vrf red
    2001:db8:1::/120 dev eth1 proto kernel metric 256  pref medium
    2001:db8:2:: dev red proto none metric 0  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    2001:db8:11::/120 via 2001:db8:1::16 dev eth1 metric 1024  pref medium
    2001:db8:11::/120 via 2001:db8:2::17 dev eth2 metric 1024  pref medium
    ...

Set link down:
    $ ip li set eth1 down

IPv4 retains the multihop route but flags eth1 route as dead:

    $ ip ro ls vrf red
    unreachable default metric 8192
    1.1.1.0/24
            nexthop via 10.100.1.16  dev eth1 weight 1 dead linkdown
            nexthop via 10.100.2.16  dev eth2 weight 1
    10.100.2.0/24 dev eth2 proto kernel scope link src 10.100.2.4

and IPv6 deletes the route as part of flushing all routes for the device:

    $ ip -6 ro ls vrf red
    2001:db8:2:: dev red proto none metric 0  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    2001:db8:11::/120 via 2001:db8:2::17 dev eth2 metric 1024  pref medium
    ...

Worse, on admin up of the device the multipath route has to be deleted
to get this leg of the route re-added.

This patch keeps routes that are part of a multipath route if
ignore_routes_with_linkdown is set with the dead and linkdown flags
enabling consistency between IPv4 and IPv6:

    $ ip -6 ro ls vrf red
    2001:db8:2:: dev red proto none metric 0  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    2001:db8:11::/120 via 2001:db8:1::16 dev eth1 metric 1024 dead linkdown  pref medium
    2001:db8:11::/120 via 2001:db8:2::17 dev eth2 metric 1024  pref medium
    ...

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5585c501a540..4b1f0f98a0e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2711,13 +2711,16 @@ struct arg_dev_net {
 	struct net *net;
 };
 
+/* called with write lock held for table with rt */
 static int fib6_ifdown(struct rt6_info *rt, void *arg)
 {
 	const struct arg_dev_net *adn = arg;
 	const struct net_device *dev = adn->dev;
 
 	if ((rt->dst.dev == dev || !dev) &&
-	    rt != adn->net->ipv6.ip6_null_entry)
+	    rt != adn->net->ipv6.ip6_null_entry &&
+	    (rt->rt6i_nsiblings == 0 ||
+	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
 		return -1;
 
 	return 0;
@@ -3216,7 +3219,7 @@ static int rt6_fill_node(struct net *net,
 	else
 		rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
-	if (!netif_carrier_ok(rt->dst.dev)) {
+	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
 			rtm->rtm_flags |= RTNH_F_DEAD;
-- 
cgit v1.2.3


From c2a2efbbfcb31bedcf81170fc1aa920255c33b8f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 20 Jan 2017 05:06:08 -0800
Subject: net: remove bh disabling around percpu_counter accesses

Shaohua Li made percpu_counter irq safe in commit 098faf5805c8
("percpu_counter: make APIs irq safe")

We can safely remove BH disable/enable sections around various
percpu_counter manipulations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_ops.h           | 9 +--------
 include/net/inet_frag.h         | 8 +-------
 net/ipv4/inet_connection_sock.c | 3 +--
 net/ipv4/proc.c                 | 2 --
 net/ipv4/tcp.c                  | 2 --
 net/ipv4/tcp_ipv4.c             | 2 --
 6 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index a0d443ca16fc..8a2b66d8d78d 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -46,19 +46,12 @@ static inline int dst_entries_get_fast(struct dst_ops *dst)
 
 static inline int dst_entries_get_slow(struct dst_ops *dst)
 {
-	int res;
-
-	local_bh_disable();
-	res = percpu_counter_sum_positive(&dst->pcpuc_entries);
-	local_bh_enable();
-	return res;
+	return percpu_counter_sum_positive(&dst->pcpuc_entries);
 }
 
 static inline void dst_entries_add(struct dst_ops *dst, int val)
 {
-	local_bh_disable();
 	percpu_counter_add(&dst->pcpuc_entries, val);
-	local_bh_enable();
 }
 
 static inline int dst_entries_init(struct dst_ops *dst)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 909972aa3acd..5894730ec82a 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -164,13 +164,7 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 
 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
 {
-	unsigned int res;
-
-	local_bh_disable();
-	res = percpu_counter_sum_positive(&nf->mem);
-	local_bh_enable();
-
-	return res;
+	return percpu_counter_sum_positive(&nf->mem);
 }
 
 /* RFC 3168 support :
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 096a085611ab..c7f7c5335369 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -836,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	local_bh_disable();
 	percpu_counter_dec(sk->sk_prot->orphan_count);
-	local_bh_enable();
+
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0247ca032232..a9deeb90dd36 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,10 +57,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	unsigned int frag_mem;
 	int orphans, sockets;
 
-	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
 	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
-	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index aba6ea76338e..c43eb1a831d7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -420,9 +420,7 @@ void tcp_init_sock(struct sock *sk)
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
-	local_bh_disable();
 	sk_sockets_allocated_inc(sk);
-	local_bh_enable();
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3644fc117691..f7325b25b06e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1887,9 +1887,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	tcp_free_fastopen_req(tp);
 	tcp_saved_syn_free(tp);
 
-	local_bh_disable();
 	sk_sockets_allocated_dec(sk);
-	local_bh_enable();
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
-- 
cgit v1.2.3


From c10aa71b9d823603306a3bcd19f6c584bdf14ef7 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Fri, 20 Jan 2017 14:53:06 +0100
Subject: gre6: Clean up unused struct ipv6_tel_txoption definition

Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE
functions") removed the ip6gre specific transmit function, but left the
struct ipv6_tel_txoption definition. Clean it up.

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 75b6108234dd..65bdfd1cca80 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -484,11 +484,6 @@ drop:
 	return 0;
 }
 
-struct ipv6_tel_txoption {
-	struct ipv6_txoptions ops;
-	__u8 dst_opt[8];
-};
-
 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 {
 	return iptunnel_handle_offloads(skb,
-- 
cgit v1.2.3


From 9af15c38254d81c9991eba89335ca7c537d7f2c3 Mon Sep 17 00:00:00 2001
From: Phil Sutter
Date: Wed, 18 Jan 2017 14:04:39 +0100
Subject: device: Implement a bus agnostic dev_num_vf routine

Now that pci_bus_type has num_vf callback set, dev_num_vf can be
implemented in a bus type independent way and the check for whether a
PCI device is being handled in rtnetlink can be dropped.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/device.h | 7 +++++++
 include/linux/pci.h    | 2 --
 net/core/rtnetlink.c   | 3 +--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/linux/device.h b/include/linux/device.h
index 6d73b70a4a5d..bd684fc8ec1d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1144,6 +1144,13 @@ extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 
+static inline int dev_num_vf(struct device *dev)
+{
+	if (dev->bus && dev->bus->num_vf)
+		return dev->bus->num_vf(dev);
+	return 0;
+}
+
 /*
  * Root device objects for grouping under /sys/devices
  */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e2d1a124216a..adbc859fe7c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -885,7 +885,6 @@ void pcibios_setup_bridge(struct pci_bus *bus, unsigned long type);
 void pci_sort_breadthfirst(void);
 #define dev_is_pci(d) ((d)->bus == &pci_bus_type)
 #define dev_is_pf(d) ((dev_is_pci(d) ? to_pci_dev(d)->is_physfn : false))
-#define dev_num_vf(d) ((dev_is_pci(d) ? pci_num_vf(to_pci_dev(d)) : 0))
 
 /* Generic PCI functions exported to card drivers */
 
@@ -1630,7 +1629,6 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
 
 #define dev_is_pci(d) (false)
 #define dev_is_pf(d) (false)
-#define dev_num_vf(d) (0)
 #endif /* CONFIG_PCI */
 
 /* Include architecture-dependent settings and functions */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f538f764fca6..152744643074 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 static inline int rtnl_vfinfo_size(const struct net_device *dev,
 				   u32 ext_filter_mask)
 {
-	if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
-	    (ext_filter_mask & RTEXT_FILTER_VF)) {
+	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int num_vfs = dev_num_vf(dev->dev.parent);
 		size_t size = nla_total_size(0);
 		size += num_vfs *
-- 
cgit v1.2.3


From 9999974a8318b605ebae08a87e86232659e56a52 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Wed, 18 Jan 2017 13:50:50 -0500
Subject: tipc: add function for checking broadcast support in bearer

As a preparation for the 'replicast' functionality we are going to
introduce in the next commits, we need the broadcast base structure to
store whether bearer broadcast is available at all from the currently
used bearer or bearers.

We do this by adding a new function tipc_bearer_bcast_support() to
the bearer layer, and letting the bearer selection function in
bcast.c use this to give a new boolean field, 'bcast_support' the
appropriate value.

Reviewed-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c     | 12 +++++++++---
 net/tipc/bearer.c    | 15 ++++++++++++++-
 net/tipc/bearer.h    |  8 +++++++-
 net/tipc/udp_media.c |  8 ++++----
 4 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index c35fad3e08e8..325627612bac 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/bcast.c: TIPC broadcast code
  *
- * Copyright (c) 2004-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
  * Copyright (c) 2004, Intel Corporation.
  * Copyright (c) 2005, 2010-2011, Wind River Systems
  * All rights reserved.
@@ -54,12 +54,14 @@ const char tipc_bclink_name[] = "broadcast-link";
  * @inputq: data input queue; will only carry SOCK_WAKEUP messages
  * @dest: array keeping number of reachable destinations per bearer
  * @primary_bearer: a bearer having links to all broadcast destinations, if any
+ * @bcast_support: indicates if primary bearer, if any, supports broadcast
  */
 struct tipc_bc_base {
 	struct tipc_link *link;
 	struct sk_buff_head inputq;
 	int dests[MAX_BEARERS];
 	int primary_bearer;
+	bool bcast_support;
 };
 
 static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -79,9 +81,10 @@ static void tipc_bcbase_select_primary(struct net *net)
 {
 	struct tipc_bc_base *bb = tipc_bc_base(net);
 	int all_dests =  tipc_link_bc_peers(bb->link);
-	int i, mtu;
+	int i, mtu, prim;
 
 	bb->primary_bearer = INVALID_BEARER_ID;
+	bb->bcast_support = true;
 
 	if (!all_dests)
 		return;
@@ -93,7 +96,7 @@ static void tipc_bcbase_select_primary(struct net *net)
 		mtu = tipc_bearer_mtu(net, i);
 		if (mtu < tipc_link_mtu(bb->link))
 			tipc_link_set_mtu(bb->link, mtu);
-
+		bb->bcast_support &= tipc_bearer_bcast_support(net, i);
 		if (bb->dests[i] < all_dests)
 			continue;
 
@@ -103,6 +106,9 @@ static void tipc_bcbase_select_primary(struct net *net)
 		if ((i ^ tipc_own_addr(net)) & 1)
 			break;
 	}
+	prim = bb->primary_bearer;
+	if (prim != INVALID_BEARER_ID)
+		bb->bcast_support = tipc_bearer_bcast_support(net, prim);
 }
 
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52d74760fb68..33a5bdfbef76 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 	memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
 	memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
 	b->bcast_addr.media_id = b->media->type_id;
-	b->bcast_addr.broadcast = 1;
+	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	b->mtu = dev->mtu;
 	b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
 	rcu_assign_pointer(dev->tipc_ptr, b);
@@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id)
+{
+	bool supp = false;
+	struct tipc_bearer *b;
+
+	rcu_read_lock();
+	b = bearer_get(net, bearer_id);
+	if (b)
+		supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT);
+	rcu_read_unlock();
+	return supp;
+}
+
 int tipc_bearer_mtu(struct net *net, u32 bearer_id)
 {
 	int mtu = 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 278ff7f616f9..635c9086e19a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -60,9 +60,14 @@
 #define TIPC_MEDIA_TYPE_IB	2
 #define TIPC_MEDIA_TYPE_UDP	3
 
-/* minimum bearer MTU */
+/* Minimum bearer MTU */
 #define TIPC_MIN_BEARER_MTU	(MAX_H_SIZE + INT_H_SIZE)
 
+/* Identifiers for distinguishing between broadcast/multicast and replicast
+ */
+#define TIPC_BROADCAST_SUPPORT  1
+#define TIPC_REPLICAST_SUPPORT  2
+
 /**
  * struct tipc_media_addr - destination address used by TIPC bearers
  * @value: address info (format defined by media)
@@ -210,6 +215,7 @@ int tipc_bearer_setup(void);
 void tipc_bearer_cleanup(void);
 void tipc_bearer_stop(struct net *net);
 int tipc_bearer_mtu(struct net *net, u32 bearer_id);
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
 void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
 			  struct sk_buff *skb,
 			  struct tipc_media_addr *dest);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b58dc95f3d35..46061cf48cd1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
 	memcpy(addr->value, ua, sizeof(struct udp_media_addr));
 
 	if (tipc_udp_is_mcast_addr(ua))
-		addr->broadcast = 1;
+		addr->broadcast = TIPC_BROADCAST_SUPPORT;
 }
 
 /* tipc_udp_addr2str - convert ip/udp address to string */
@@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
-	if (!addr->broadcast || list_empty(&ub->rcast.list))
+	if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
 		return tipc_udp_xmit(net, skb, ub, src, dst);
 
 	/* Replicast, send an skb to each configured IP address */
@@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
 	else if (ntohs(addr->proto) == ETH_P_IPV6)
 		pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
 #endif
-
+	b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
 	list_add_rcu(&rcast->list, &ub->rcast.list);
 	return 0;
 }
@@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 		goto err;
 
 	b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
-	b->bcast_addr.broadcast = 1;
+	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	rcu_assign_pointer(b->media_ptr, ub);
 	rcu_assign_pointer(ub->bearer, b);
 	tipc_udp_media_addr_set(&b->addr, &local);
-- 
cgit v1.2.3


From 2ae0b8af1fe35ddaa2e46704ae31a2f9cac0349d Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Wed, 18 Jan 2017 13:50:51 -0500
Subject: tipc: add functionality to lookup multicast destination nodes

As a further preparation for the upcoming 'replicast' functionality,
we add some necessary structs and functions for looking up and returning
a list of all nodes that host destinations for a given multicast message.

Reviewed-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c      | 33 +++++++++++++++++++++++++++++++--
 net/tipc/bcast.h      | 15 ++++++++++++++-
 net/tipc/name_table.c | 38 +++++++++++++++++++++++++++++++++-----
 net/tipc/name_table.h |  9 +++++++++
 4 files changed, 87 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 325627612bac..412d3351abb7 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -39,9 +39,8 @@
 #include "socket.h"
 #include "msg.h"
 #include "bcast.h"
-#include "name_distr.h"
 #include "link.h"
-#include "node.h"
+#include "name_table.h"
 
 #define	BCLINK_WIN_DEFAULT	50	/* bcast link window size (default) */
 #define	BCLINK_WIN_MIN	        32	/* bcast minimum link window size */
@@ -434,3 +433,33 @@ void tipc_bcast_stop(struct net *net)
 	kfree(tn->bcbase);
 	kfree(tn->bcl);
 }
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self)
+{
+	memset(nl, 0, sizeof(*nl));
+	INIT_LIST_HEAD(&nl->list);
+	nl->self = self;
+}
+
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node)
+{
+	if (node == nl->self)
+		nl->local = true;
+	else if (u32_push(&nl->list, node))
+		nl->remote++;
+}
+
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node)
+{
+	if (node == nl->self)
+		nl->local = false;
+	else if (u32_del(&nl->list, node))
+		nl->remote--;
+}
+
+void tipc_nlist_purge(struct tipc_nlist *nl)
+{
+	u32_list_purge(&nl->list);
+	nl->remote = 0;
+	nl->local = 0;
+}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 855d53c64ab3..18f379198f8f 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -42,9 +42,22 @@
 struct tipc_node;
 struct tipc_msg;
 struct tipc_nl_msg;
-struct tipc_node_map;
+struct tipc_nlist;
+struct tipc_nitem;
 extern const char tipc_bclink_name[];
 
+struct tipc_nlist {
+	struct list_head list;
+	u32 self;
+	u16 remote;
+	bool local;
+};
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self);
+void tipc_nlist_purge(struct tipc_nlist *nl);
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
+
 int tipc_bcast_init(struct net *net);
 void tipc_bcast_stop(struct net *net);
 void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 5a86df1e5fc2..9be6592e4a6f 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -645,6 +645,39 @@ exit:
 	return res;
 }
 
+/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
+ * - Creates list of nodes that overlap the given multicast address
+ * - Determines if any node local ports overlap
+ */
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+				   u32 upper, u32 domain,
+				   struct tipc_nlist *nodes)
+{
+	struct sub_seq *sseq, *stop;
+	struct publication *publ;
+	struct name_info *info;
+	struct name_seq *seq;
+
+	rcu_read_lock();
+	seq = nametbl_find_seq(net, type);
+	if (!seq)
+		goto exit;
+
+	spin_lock_bh(&seq->lock);
+	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+	stop = seq->sseqs + seq->first_free;
+	for (; sseq->lower <= upper && sseq != stop; sseq++) {
+		info = sseq->info;
+		list_for_each_entry(publ, &info->zone_list, zone_list) {
+			if (tipc_in_scope(domain, publ->node))
+				tipc_nlist_add(nodes, publ->node);
+		}
+	}
+	spin_unlock_bh(&seq->lock);
+exit:
+	rcu_read_unlock();
+}
+
 /*
  * tipc_nametbl_publish - add name publication to network name tables
  */
@@ -1022,11 +1055,6 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-struct u32_item {
-	struct list_head list;
-	u32 value;
-};
-
 bool u32_find(struct list_head *l, u32 value)
 {
 	struct u32_item *item;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index c89bb3f5c364..6ebdeb1d84a5 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -39,6 +39,7 @@
 
 struct tipc_subscription;
 struct tipc_plist;
+struct tipc_nlist;
 
 /*
  * TIPC name types reserved for internal TIPC use (both current and planned)
@@ -100,6 +101,9 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 			      u32 limit, struct list_head *dports);
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+				   u32 upper, u32 domain,
+				   struct tipc_nlist *nodes);
 struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 					 u32 upper, u32 scope, u32 port_ref,
 					 u32 key);
@@ -116,6 +120,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
 
+struct u32_item {
+	struct list_head list;
+	u32 value;
+};
+
 bool u32_push(struct list_head *l, u32 value);
 u32 u32_pop(struct list_head *l);
 bool u32_find(struct list_head *l, u32 value);
-- 
cgit v1.2.3


From a853e4c6d0843729e1f25a7a7beff168e1dd7420 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Wed, 18 Jan 2017 13:50:52 -0500
Subject: tipc: introduce replicast as transport option for multicast

TIPC multicast messages are currently carried over a reliable
'broadcast link', making use of the underlying media's ability to
transport packets as L2 broadcast or IP multicast to all nodes in
the cluster.

When the used bearer is lacking that ability, we can instead emulate
the broadcast service by replicating and sending the packets over as
many unicast links as needed to reach all identified destinations.
We now introduce a new TIPC link-level 'replicast' service that does
this.

Reviewed-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c  | 105 ++++++++++++++++++++++++++++++++++++++++++------------
 net/tipc/bcast.h  |   3 +-
 net/tipc/link.c   |   8 ++++-
 net/tipc/msg.c    |  17 +++++++++
 net/tipc/msg.h    |   9 +++--
 net/tipc/node.c   |  27 +++++++++-----
 net/tipc/socket.c |  27 +++++++++-----
 7 files changed, 149 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 412d3351abb7..672e6ef93cab 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -70,7 +70,7 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
 
 int tipc_bcast_get_mtu(struct net *net)
 {
-	return tipc_link_mtu(tipc_bc_sndlink(net));
+	return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
 }
 
 /* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
@@ -175,42 +175,101 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
 	__skb_queue_purge(&_xmitq);
 }
 
-/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster
- *                    and to identified node local sockets
+/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
  * @net: the applicable net namespace
- * @list: chain of buffers containing message
+ * @pkts: chain of buffers containing message
+ * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0
  * Consumes the buffer chain.
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE
  */
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
+static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
+			   u16 *cong_link_cnt)
 {
 	struct tipc_link *l = tipc_bc_sndlink(net);
-	struct sk_buff_head xmitq, inputq, rcvq;
+	struct sk_buff_head xmitq;
 	int rc = 0;
 
-	__skb_queue_head_init(&rcvq);
 	__skb_queue_head_init(&xmitq);
-	skb_queue_head_init(&inputq);
-
-	/* Prepare message clone for local node */
-	if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
-		return -EHOSTUNREACH;
-
 	tipc_bcast_lock(net);
 	if (tipc_link_bc_peers(l))
-		rc = tipc_link_xmit(l, list, &xmitq);
+		rc = tipc_link_xmit(l, pkts, &xmitq);
 	tipc_bcast_unlock(net);
+	tipc_bcbase_xmit(net, &xmitq);
+	__skb_queue_purge(pkts);
+	if (rc == -ELINKCONG) {
+		*cong_link_cnt = 1;
+		rc = 0;
+	}
+	return rc;
+}
 
-	/* Don't send to local node if adding to link failed */
-	if (unlikely(rc && (rc != -ELINKCONG))) {
-		__skb_queue_purge(&rcvq);
-		return rc;
+/* tipc_rcast_xmit - replicate and send a message to given destination nodes
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @dests: list of destination nodes
+ * @cong_link_cnt: returns number of congested links
+ * @cong_links: returns identities of congested links
+ * Returns 0 if success, otherwise errno
+ */
+static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
+			   struct tipc_nlist *dests, u16 *cong_link_cnt)
+{
+	struct sk_buff_head _pkts;
+	struct u32_item *n, *tmp;
+	u32 dst, selector;
+
+	selector = msg_link_selector(buf_msg(skb_peek(pkts)));
+	__skb_queue_head_init(&_pkts);
+
+	list_for_each_entry_safe(n, tmp, &dests->list, list) {
+		dst = n->value;
+		if (!tipc_msg_pskb_copy(dst, pkts, &_pkts))
+			return -ENOMEM;
+
+		/* Any other return value than -ELINKCONG is ignored */
+		if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG)
+			(*cong_link_cnt)++;
 	}
+	return 0;
+}
 
-	/* Broadcast to all nodes, inluding local node */
-	tipc_bcbase_xmit(net, &xmitq);
-	tipc_sk_mcast_rcv(net, &rcvq, &inputq);
-	__skb_queue_purge(list);
+/* tipc_mcast_xmit - deliver message to indicated destination nodes
+ *                   and to identified node local sockets
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @dests: destination nodes for message. Not consumed.
+ * @cong_link_cnt: returns number of encountered congested destination links
+ * @cong_links: returns identities of congested links
+ * Consumes buffer chain.
+ * Returns 0 if success, otherwise errno
+ */
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+		    struct tipc_nlist *dests, u16 *cong_link_cnt)
+{
+	struct tipc_bc_base *bb = tipc_bc_base(net);
+	struct sk_buff_head inputq, localq;
+	int rc = 0;
+
+	skb_queue_head_init(&inputq);
+	skb_queue_head_init(&localq);
+
+	/* Clone packets before they are consumed by next call */
+	if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
+		rc = -ENOMEM;
+		goto exit;
+	}
+
+	if (dests->remote) {
+		if (!bb->bcast_support)
+			rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
+		else
+			rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
+	}
+
+	if (dests->local)
+		tipc_sk_mcast_rcv(net, &localq, &inputq);
+exit:
+	__skb_queue_purge(pkts);
 	return rc;
 }
 
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 18f379198f8f..dd772e6f6fa4 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -66,7 +66,8 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
 void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
 int  tipc_bcast_get_mtu(struct net *net);
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+		    struct tipc_nlist *dests, u16 *cong_link_cnt);
 int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
 void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index b0f8646e0631..b17b9e155469 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1032,11 +1032,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
 static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
 			    struct sk_buff_head *inputq)
 {
-	switch (msg_user(buf_msg(skb))) {
+	struct tipc_msg *hdr = buf_msg(skb);
+
+	switch (msg_user(hdr)) {
 	case TIPC_LOW_IMPORTANCE:
 	case TIPC_MEDIUM_IMPORTANCE:
 	case TIPC_HIGH_IMPORTANCE:
 	case TIPC_CRITICAL_IMPORTANCE:
+		if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
+			skb_queue_tail(l->bc_rcvlink->inputq, skb);
+			return true;
+		}
 	case CONN_MANAGER:
 		skb_queue_tail(inputq, skb);
 		return true;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index ab02d0742476..312ef7de57d7 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -607,6 +607,23 @@ error:
 	return false;
 }
 
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+			struct sk_buff_head *cpy)
+{
+	struct sk_buff *skb, *_skb;
+
+	skb_queue_walk(msg, skb) {
+		_skb = pskb_copy(skb, GFP_ATOMIC);
+		if (!_skb) {
+			__skb_queue_purge(cpy);
+			return false;
+		}
+		msg_set_destnode(buf_msg(_skb), dst);
+		__skb_queue_tail(cpy, _skb);
+	}
+	return true;
+}
+
 /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
  * @list: list to be appended to
  * @seqno: sequence number of buffer to add
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index f07b51e3f6f1..c843fd2bc48d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -631,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
 
 static inline u32 msg_link_selector(struct tipc_msg *m)
 {
+	if (msg_user(m) == MSG_FRAGMENTER)
+		m = (void *)msg_data(m);
 	return msg_bits(m, 4, 0, 1);
 }
 
-static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
-{
-	msg_set_bits(m, 4, 0, 1, n);
-}
-
 /*
  * Word 5
  */
@@ -835,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 		   int offset, int dsz, int mtu, struct sk_buff_head *list);
 bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
 bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+			struct sk_buff_head *cpy);
 void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
 			     struct sk_buff *skb);
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 2883f6a0ed98..f96dacf173ab 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1257,6 +1257,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static void tipc_node_mcast_rcv(struct tipc_node *n)
+{
+	struct tipc_bclink_entry *be = &n->bc_entry;
+
+	/* 'arrvq' is under inputq2's lock protection */
+	spin_lock_bh(&be->inputq2.lock);
+	spin_lock_bh(&be->inputq1.lock);
+	skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
+	spin_unlock_bh(&be->inputq1.lock);
+	spin_unlock_bh(&be->inputq2.lock);
+	tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2);
+}
+
 static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
 				  int bearer_id, struct sk_buff_head *xmitq)
 {
@@ -1330,15 +1343,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
 	if (!skb_queue_empty(&xmitq))
 		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
 
-	/* Deliver. 'arrvq' is under inputq2's lock protection */
-	if (!skb_queue_empty(&be->inputq1)) {
-		spin_lock_bh(&be->inputq2.lock);
-		spin_lock_bh(&be->inputq1.lock);
-		skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
-		spin_unlock_bh(&be->inputq1.lock);
-		spin_unlock_bh(&be->inputq2.lock);
-		tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
-	}
+	if (!skb_queue_empty(&be->inputq1))
+		tipc_node_mcast_rcv(n);
 
 	if (rc & TIPC_LINK_DOWN_EVT) {
 		/* Reception reassembly failure => reset all links to peer */
@@ -1565,6 +1571,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 	if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
 		tipc_named_rcv(net, &n->bc_entry.namedq);
 
+	if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
+		tipc_node_mcast_rcv(n);
+
 	if (!skb_queue_empty(&le->inputq))
 		tipc_sk_rcv(net, &le->inputq);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index d2f353934f82..93b6ae3154c9 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -740,32 +740,43 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
 	int mtu = tipc_bcast_get_mtu(net);
+	u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
 	struct sk_buff_head pkts;
+	struct tipc_nlist dsts;
 	int rc;
 
+	/* Block or return if any destination link is congested */
 	rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
 	if (unlikely(rc))
 		return rc;
 
+	/* Lookup destination nodes */
+	tipc_nlist_init(&dsts, tipc_own_addr(net));
+	tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
+				      seq->upper, domain, &dsts);
+	if (!dsts.local && !dsts.remote)
+		return -EHOSTUNREACH;
+
+	/* Build message header */
 	msg_set_type(hdr, TIPC_MCAST_MSG);
+	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
 	msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
 	msg_set_destport(hdr, 0);
 	msg_set_destnode(hdr, 0);
 	msg_set_nametype(hdr, seq->type);
 	msg_set_namelower(hdr, seq->lower);
 	msg_set_nameupper(hdr, seq->upper);
-	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
 
+	/* Build message as chain of buffers */
 	skb_queue_head_init(&pkts);
 	rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
-	if (unlikely(rc != dlen))
-		return rc;
 
-	rc = tipc_bcast_xmit(net, &pkts);
-	if (unlikely(rc == -ELINKCONG)) {
-		tsk->cong_link_cnt = 1;
-		rc = 0;
-	}
+	/* Send message if build was successful */
+	if (unlikely(rc == dlen))
+		rc = tipc_mcast_xmit(net, &pkts, &dsts,
+				     &tsk->cong_link_cnt);
+
+	tipc_nlist_purge(&dsts);
 
 	return rc ? rc : dlen;
 }
-- 
cgit v1.2.3


From 01fd12bb189a0772301dd37e9b31e53761269a1b Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Wed, 18 Jan 2017 13:50:53 -0500
Subject: tipc: make replicast a user selectable option

If the bearer carrying multicast messages supports broadcast, those
messages will be sent to all cluster nodes, irrespective of whether
these nodes host any actual destinations socket or not. This is clearly
wasteful if the cluster is large and there are only a few real
destinations for the message being sent.

In this commit we extend the eligibility of the newly introduced
"replicast" transmit option. We now make it possible for a user to
select which method he wants to be used, either as a mandatory setting
via setsockopt(), or as a relative setting where we let the broadcast
layer decide which method to use based on the ratio between cluster
size and the message's actual number of destination nodes.

In the latter case, a sending socket must stick to a previously
selected method until it enters an idle period of at least 5 seconds.
This eliminates the risk of message reordering caused by method change,
i.e., when changes to cluster size or number of destinations would
otherwise mandate a new method to be used.

Reviewed-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |  6 +++--
 net/tipc/bcast.c          | 62 ++++++++++++++++++++++++++++++++++++++++++-----
 net/tipc/bcast.h          | 17 ++++++++++++-
 net/tipc/link.c           |  4 +++
 net/tipc/node.h           |  4 ++-
 net/tipc/socket.c         | 36 +++++++++++++++++++++------
 6 files changed, 112 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index bf049e8fe31b..5351b08c897a 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -1,7 +1,7 @@
 /*
  * include/uapi/linux/tipc.h: Header for TIPC socket interface
  *
- * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2003-2006, 2015-2016 Ericsson AB
  * Copyright (c) 2005, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -220,7 +220,7 @@ struct sockaddr_tipc {
 #define TIPC_DESTNAME	3	/* destination name */
 
 /*
- * TIPC-specific socket option values
+ * TIPC-specific socket option names
  */
 
 #define TIPC_IMPORTANCE		127	/* Default: TIPC_LOW_IMPORTANCE */
@@ -229,6 +229,8 @@ struct sockaddr_tipc {
 #define TIPC_CONN_TIMEOUT	130	/* Default: 8000 (ms)  */
 #define TIPC_NODE_RECVQ_DEPTH	131	/* Default: none (read only) */
 #define TIPC_SOCK_RECVQ_DEPTH	132	/* Default: none (read only) */
+#define TIPC_MCAST_BROADCAST    133     /* Default: TIPC selects. No arg */
+#define TIPC_MCAST_REPLICAST    134     /* Default: TIPC selects. No arg */
 
 /*
  * Maximum sizes of TIPC bearer-related names (including terminating NULL)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 672e6ef93cab..7d99029df342 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -54,6 +54,9 @@ const char tipc_bclink_name[] = "broadcast-link";
  * @dest: array keeping number of reachable destinations per bearer
  * @primary_bearer: a bearer having links to all broadcast destinations, if any
  * @bcast_support: indicates if primary bearer, if any, supports broadcast
+ * @rcast_support: indicates if all peer nodes support replicast
+ * @rc_ratio: dest count as percentage of cluster size where send method changes
+ * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast
  */
 struct tipc_bc_base {
 	struct tipc_link *link;
@@ -61,6 +64,9 @@ struct tipc_bc_base {
 	int dests[MAX_BEARERS];
 	int primary_bearer;
 	bool bcast_support;
+	bool rcast_support;
+	int rc_ratio;
+	int bc_threshold;
 };
 
 static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -73,6 +79,19 @@ int tipc_bcast_get_mtu(struct net *net)
 	return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
 }
 
+void tipc_bcast_disable_rcast(struct net *net)
+{
+	tipc_bc_base(net)->rcast_support = false;
+}
+
+static void tipc_bcbase_calc_bc_threshold(struct net *net)
+{
+	struct tipc_bc_base *bb = tipc_bc_base(net);
+	int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net));
+
+	bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100);
+}
+
 /* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
  *                               if any, and make it primary bearer
  */
@@ -175,6 +194,31 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
 	__skb_queue_purge(&_xmitq);
 }
 
+static void tipc_bcast_select_xmit_method(struct net *net, int dests,
+					  struct tipc_mc_method *method)
+{
+	struct tipc_bc_base *bb = tipc_bc_base(net);
+	unsigned long exp = method->expires;
+
+	/* Broadcast supported by used bearer/bearers? */
+	if (!bb->bcast_support) {
+		method->rcast = true;
+		return;
+	}
+	/* Any destinations which don't support replicast ? */
+	if (!bb->rcast_support) {
+		method->rcast = false;
+		return;
+	}
+	/* Can current method be changed ? */
+	method->expires = jiffies + TIPC_METHOD_EXPIRE;
+	if (method->mandatory || time_before(jiffies, exp))
+		return;
+
+	/* Determine method to use now */
+	method->rcast = dests <= bb->bc_threshold;
+}
+
 /* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
  * @net: the applicable net namespace
  * @pkts: chain of buffers containing message
@@ -237,16 +281,16 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
  *                   and to identified node local sockets
  * @net: the applicable net namespace
  * @pkts: chain of buffers containing message
- * @dests: destination nodes for message. Not consumed.
+ * @method: send method to be used
+ * @dests: destination nodes for message.
  * @cong_link_cnt: returns number of encountered congested destination links
- * @cong_links: returns identities of congested links
  * Consumes buffer chain.
  * Returns 0 if success, otherwise errno
  */
 int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
-		    struct tipc_nlist *dests, u16 *cong_link_cnt)
+		    struct tipc_mc_method *method, struct tipc_nlist *dests,
+		    u16 *cong_link_cnt)
 {
-	struct tipc_bc_base *bb = tipc_bc_base(net);
 	struct sk_buff_head inputq, localq;
 	int rc = 0;
 
@@ -258,9 +302,10 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
 		rc = -ENOMEM;
 		goto exit;
 	}
-
+	/* Send according to determined transmit method */
 	if (dests->remote) {
-		if (!bb->bcast_support)
+		tipc_bcast_select_xmit_method(net, dests->remote, method);
+		if (method->rcast)
 			rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
 		else
 			rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
@@ -269,6 +314,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
 	if (dests->local)
 		tipc_sk_mcast_rcv(net, &localq, &inputq);
 exit:
+	/* This queue should normally be empty by now */
 	__skb_queue_purge(pkts);
 	return rc;
 }
@@ -377,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
 	tipc_bcast_lock(net);
 	tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
 	tipc_bcbase_select_primary(net);
+	tipc_bcbase_calc_bc_threshold(net);
 	tipc_bcast_unlock(net);
 }
 
@@ -395,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
 	tipc_bcast_lock(net);
 	tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
 	tipc_bcbase_select_primary(net);
+	tipc_bcbase_calc_bc_threshold(net);
 	tipc_bcast_unlock(net);
 
 	tipc_bcbase_xmit(net, &xmitq);
@@ -477,6 +525,8 @@ int tipc_bcast_init(struct net *net)
 		goto enomem;
 	bb->link = l;
 	tn->bcl = l;
+	bb->rc_ratio = 25;
+	bb->rcast_support = true;
 	return 0;
 enomem:
 	kfree(bb);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index dd772e6f6fa4..751530ab0c49 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -46,6 +46,8 @@ struct tipc_nlist;
 struct tipc_nitem;
 extern const char tipc_bclink_name[];
 
+#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000)
+
 struct tipc_nlist {
 	struct list_head list;
 	u32 self;
@@ -58,6 +60,17 @@ void tipc_nlist_purge(struct tipc_nlist *nl);
 void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
 void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
 
+/* Cookie to be used between socket and broadcast layer
+ * @rcast: replicast (instead of broadcast) was used at previous xmit
+ * @mandatory: broadcast/replicast indication was set by user
+ * @expires: re-evaluate non-mandatory transmit method if we are past this
+ */
+struct tipc_mc_method {
+	bool rcast;
+	bool mandatory;
+	unsigned long expires;
+};
+
 int tipc_bcast_init(struct net *net);
 void tipc_bcast_stop(struct net *net);
 void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
@@ -66,8 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
 void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
 int  tipc_bcast_get_mtu(struct net *net);
+void tipc_bcast_disable_rcast(struct net *net);
 int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
-		    struct tipc_nlist *dests, u16 *cong_link_cnt);
+		    struct tipc_mc_method *method, struct tipc_nlist *dests,
+		    u16 *cong_link_cnt);
 int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
 void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index b17b9e155469..ddd2dd6f77aa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 	if (link_is_bc_sndlink(l))
 		l->state = LINK_ESTABLISHED;
 
+	/* Disable replicast if even a single peer doesn't support it */
+	if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
+		tipc_bcast_disable_rcast(net);
+
 	return true;
 }
 
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 39ef54c1f2ad..898c22916984 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -47,11 +47,13 @@
 enum {
 	TIPC_BCAST_SYNCH      = (1 << 1),
 	TIPC_BCAST_STATE_NACK = (1 << 2),
-	TIPC_BLOCK_FLOWCTL    = (1 << 3)
+	TIPC_BLOCK_FLOWCTL    = (1 << 3),
+	TIPC_BCAST_RCAST      = (1 << 4)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
 				TIPC_BCAST_STATE_NACK | \
+				TIPC_BCAST_RCAST | \
 				TIPC_BLOCK_FLOWCTL)
 #define INVALID_BEARER_ID -1
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 93b6ae3154c9..5bec8aac5008 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -79,6 +79,7 @@ enum {
  * @rcv_unacked: # messages read by user, but not yet acked back to peer
  * @peer: 'connected' peer for dgram/rdm
  * @node: hash table node
+ * @mc_method: cookie for use between socket and broadcast layer
  * @rcu: rcu struct for tipc_sock
  */
 struct tipc_sock {
@@ -103,6 +104,7 @@ struct tipc_sock {
 	u16 rcv_win;
 	struct sockaddr_tipc peer;
 	struct rhash_head node;
+	struct tipc_mc_method mc_method;
 	struct rcu_head rcu;
 };
 
@@ -740,6 +742,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
 	int mtu = tipc_bcast_get_mtu(net);
+	struct tipc_mc_method *method = &tsk->mc_method;
 	u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
 	struct sk_buff_head pkts;
 	struct tipc_nlist dsts;
@@ -773,7 +776,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 
 	/* Send message if build was successful */
 	if (unlikely(rc == dlen))
-		rc = tipc_mcast_xmit(net, &pkts, &dsts,
+		rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
 				     &tsk->cong_link_cnt);
 
 	tipc_nlist_purge(&dsts);
@@ -2344,18 +2347,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	u32 value;
+	u32 value = 0;
 	int res;
 
 	if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
 		return 0;
 	if (lvl != SOL_TIPC)
 		return -ENOPROTOOPT;
-	if (ol < sizeof(value))
-		return -EINVAL;
-	res = get_user(value, (u32 __user *)ov);
-	if (res)
-		return res;
+
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+	case TIPC_SRC_DROPPABLE:
+	case TIPC_DEST_DROPPABLE:
+	case TIPC_CONN_TIMEOUT:
+		if (ol < sizeof(value))
+			return -EINVAL;
+		res = get_user(value, (u32 __user *)ov);
+		if (res)
+			return res;
+		break;
+	default:
+		if (ov || ol)
+			return -EINVAL;
+	}
 
 	lock_sock(sk);
 
@@ -2376,6 +2390,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 		tipc_sk(sk)->conn_timeout = value;
 		/* no need to set "res", since already 0 at this point */
 		break;
+	case TIPC_MCAST_BROADCAST:
+		tsk->mc_method.rcast = false;
+		tsk->mc_method.mandatory = true;
+		break;
+	case TIPC_MCAST_REPLICAST:
+		tsk->mc_method.rcast = true;
+		tsk->mc_method.mandatory = true;
+		break;
 	default:
 		res = -EINVAL;
 	}
-- 
cgit v1.2.3


From 264b87fa617e758966108db48db220571ff3d60e Mon Sep 17 00:00:00 2001
From: Andrew Collins
Date: Wed, 18 Jan 2017 14:04:28 -0700
Subject: fq_codel: Avoid regenerating skb flow hash unless necessary

The fq_codel qdisc currently always regenerates the skb flow hash.
This wastes some cycles and prevents flow seperation in cases where
the traffic has been encrypted and can no longer be understood by the
flow dissector.

Change it to use the prexisting flow hash if one exists, and only
regenerate if necessary.

Signed-off-by: Andrew Collins <acollins@cradlepoint.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq_codel.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5ea0e9b6be4..2f50e4c72fb4 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -57,7 +57,6 @@ struct fq_codel_sched_data {
 	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
 	u32		*backlogs;	/* backlog table [flows_cnt] */
 	u32		flows_cnt;	/* number of flows */
-	u32		perturbation;	/* hash perturbation */
 	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
 	u32		drop_batch_size;
 	u32		memory_limit;
@@ -75,9 +74,7 @@ struct fq_codel_sched_data {
 static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
 				  struct sk_buff *skb)
 {
-	u32 hash = skb_get_hash_perturb(skb, q->perturbation);
-
-	return reciprocal_scale(hash, q->flows_cnt);
+	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
 }
 
 static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -482,7 +479,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 	q->memory_limit = 32 << 20; /* 32 MBytes */
 	q->drop_batch_size = 64;
 	q->quantum = psched_mtu(qdisc_dev(sch));
-	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
 	codel_params_init(&q->cparams);
-- 
cgit v1.2.3


From 1b7cd0044e4a9f69aaf00511870b07fcdeda591d Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar
Date: Wed, 18 Jan 2017 15:02:49 -0800
Subject: net: remove duplicate code.

netdev_rx_handler_register() checks to see if the handler is already
busy which was recently separated into netdev_is_rx_handler_busy(). So
use the same function inside register() to avoid code duplication.
Essentially this change should be a no-op

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index ad5959e56116..c8f1f67ff16c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3961,9 +3961,7 @@ int netdev_rx_handler_register(struct net_device *dev,
 			       rx_handler_func_t *rx_handler,
 			       void *rx_handler_data)
 {
-	ASSERT_RTNL();
-
-	if (dev->rx_handler)
+	if (netdev_is_rx_handler_busy(dev))
 		return -EBUSY;
 
 	/* Note: rx_handler_data must be set before rx_handler */
-- 
cgit v1.2.3


From 319554f284dda9f2737d09df82ba3610bd8ddea3 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 19 Jan 2017 17:47:46 -0500
Subject: inet: don't use sk_v6_rcv_saddr directly

When comparing two sockets we need to use inet6_rcv_saddr so we get a NULL
sk_v6_rcv_saddr if the socket isn't AF_INET6, otherwise our comparison function
can be wrong.

Fixes: 637bc8b ("inet: reset tb->fastreuseport when adding a reuseport sk")
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c7f7c5335369..b4d5980ade3b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -99,7 +99,7 @@ int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
 		return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
-					    &sk2->sk_v6_rcv_saddr,
+					    inet6_rcv_saddr(sk2),
 					    sk->sk_rcv_saddr,
 					    sk2->sk_rcv_saddr,
 					    ipv6_only_sock(sk),
-- 
cgit v1.2.3


From cf1a56a4cf196a2922e66e9a8e0bf80d324c5548 Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Fri, 20 Jan 2017 01:37:50 +0100
Subject: net: dsa: Remove hwmon support

Only the Marvell mv88e6xxx DSA driver made use of the HWMON support in
DSA. The temperature sensor registers are actually in the embedded
PHYs, and the PHY driver now supports it. So remove all HWMON support
from DSA and drivers.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c      | 154 ----------------------------------
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  16 ----
 include/net/dsa.h                     |   8 --
 net/dsa/Kconfig                       |  11 ---
 net/dsa/Makefile                      |   1 -
 net/dsa/dsa.c                         |   4 -
 net/dsa/dsa_priv.h                    |   9 --
 net/dsa/hwmon.c                       | 147 --------------------------------
 8 files changed, 350 deletions(-)
 delete mode 100644 net/dsa/hwmon.c

(limited to 'net')

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 987b2dbbd35a..c7e08e13bb54 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2964,154 +2964,6 @@ static void mv88e6xxx_mdio_unregister(struct mv88e6xxx_chip *chip)
 		of_node_put(chip->mdio_np);
 }
 
-#ifdef CONFIG_NET_DSA_HWMON
-
-static int mv88e61xx_get_temp(struct dsa_switch *ds, int *temp)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	u16 val;
-	int ret;
-
-	*temp = 0;
-
-	mutex_lock(&chip->reg_lock);
-
-	ret = mv88e6xxx_phy_write(chip, 0x0, 0x16, 0x6);
-	if (ret < 0)
-		goto error;
-
-	/* Enable temperature sensor */
-	ret = mv88e6xxx_phy_read(chip, 0x0, 0x1a, &val);
-	if (ret < 0)
-		goto error;
-
-	ret = mv88e6xxx_phy_write(chip, 0x0, 0x1a, val | (1 << 5));
-	if (ret < 0)
-		goto error;
-
-	/* Wait for temperature to stabilize */
-	usleep_range(10000, 12000);
-
-	ret = mv88e6xxx_phy_read(chip, 0x0, 0x1a, &val);
-	if (ret < 0)
-		goto error;
-
-	/* Disable temperature sensor */
-	ret = mv88e6xxx_phy_write(chip, 0x0, 0x1a, val & ~(1 << 5));
-	if (ret < 0)
-		goto error;
-
-	*temp = ((val & 0x1f) - 5) * 5;
-
-error:
-	mv88e6xxx_phy_write(chip, 0x0, 0x16, 0x0);
-	mutex_unlock(&chip->reg_lock);
-	return ret;
-}
-
-static int mv88e63xx_get_temp(struct dsa_switch *ds, int *temp)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	int phy = mv88e6xxx_6320_family(chip) ? 3 : 0;
-	u16 val;
-	int ret;
-
-	*temp = 0;
-
-	mutex_lock(&chip->reg_lock);
-	ret = mv88e6xxx_phy_page_read(chip, phy, 6, 27, &val);
-	mutex_unlock(&chip->reg_lock);
-	if (ret < 0)
-		return ret;
-
-	*temp = (val & 0xff) - 25;
-
-	return 0;
-}
-
-static int mv88e6xxx_get_temp(struct dsa_switch *ds, int *temp)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-
-	if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_TEMP))
-		return -EOPNOTSUPP;
-
-	if (mv88e6xxx_6320_family(chip) || mv88e6xxx_6352_family(chip))
-		return mv88e63xx_get_temp(ds, temp);
-
-	return mv88e61xx_get_temp(ds, temp);
-}
-
-static int mv88e6xxx_get_temp_limit(struct dsa_switch *ds, int *temp)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	int phy = mv88e6xxx_6320_family(chip) ? 3 : 0;
-	u16 val;
-	int ret;
-
-	if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_TEMP_LIMIT))
-		return -EOPNOTSUPP;
-
-	*temp = 0;
-
-	mutex_lock(&chip->reg_lock);
-	ret = mv88e6xxx_phy_page_read(chip, phy, 6, 26, &val);
-	mutex_unlock(&chip->reg_lock);
-	if (ret < 0)
-		return ret;
-
-	*temp = (((val >> 8) & 0x1f) * 5) - 25;
-
-	return 0;
-}
-
-static int mv88e6xxx_set_temp_limit(struct dsa_switch *ds, int temp)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	int phy = mv88e6xxx_6320_family(chip) ? 3 : 0;
-	u16 val;
-	int err;
-
-	if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_TEMP_LIMIT))
-		return -EOPNOTSUPP;
-
-	mutex_lock(&chip->reg_lock);
-	err = mv88e6xxx_phy_page_read(chip, phy, 6, 26, &val);
-	if (err)
-		goto unlock;
-	temp = clamp_val(DIV_ROUND_CLOSEST(temp, 5) + 5, 0, 0x1f);
-	err = mv88e6xxx_phy_page_write(chip, phy, 6, 26,
-				       (val & 0xe0ff) | (temp << 8));
-unlock:
-	mutex_unlock(&chip->reg_lock);
-
-	return err;
-}
-
-static int mv88e6xxx_get_temp_alarm(struct dsa_switch *ds, bool *alarm)
-{
-	struct mv88e6xxx_chip *chip = ds->priv;
-	int phy = mv88e6xxx_6320_family(chip) ? 3 : 0;
-	u16 val;
-	int ret;
-
-	if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_TEMP_LIMIT))
-		return -EOPNOTSUPP;
-
-	*alarm = false;
-
-	mutex_lock(&chip->reg_lock);
-	ret = mv88e6xxx_phy_page_read(chip, phy, 6, 26, &val);
-	mutex_unlock(&chip->reg_lock);
-	if (ret < 0)
-		return ret;
-
-	*alarm = !!(val & 0x40);
-
-	return 0;
-}
-#endif /* CONFIG_NET_DSA_HWMON */
-
 static int mv88e6xxx_get_eeprom_len(struct dsa_switch *ds)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
@@ -4386,12 +4238,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.get_sset_count		= mv88e6xxx_get_sset_count,
 	.set_eee		= mv88e6xxx_set_eee,
 	.get_eee		= mv88e6xxx_get_eee,
-#ifdef CONFIG_NET_DSA_HWMON
-	.get_temp		= mv88e6xxx_get_temp,
-	.get_temp_limit		= mv88e6xxx_get_temp_limit,
-	.set_temp_limit		= mv88e6xxx_set_temp_limit,
-	.get_temp_alarm		= mv88e6xxx_get_temp_alarm,
-#endif
 	.get_eeprom_len		= mv88e6xxx_get_eeprom_len,
 	.get_eeprom		= mv88e6xxx_get_eeprom,
 	.set_eeprom		= mv88e6xxx_set_eeprom,
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index 466cfdadb7bd..ce8b43b14e96 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -497,12 +497,6 @@ enum mv88e6xxx_cap {
 	 */
 	MV88E6XXX_CAP_STU,
 
-	/* Internal temperature sensor.
-	 * Available from any enabled port's PHY register 26, page 6.
-	 */
-	MV88E6XXX_CAP_TEMP,
-	MV88E6XXX_CAP_TEMP_LIMIT,
-
 	/* VLAN Table Unit.
 	 * The VTU is used to program 802.1Q VLANs. See GLOBAL_VTU_OP.
 	 */
@@ -533,8 +527,6 @@ enum mv88e6xxx_cap {
 #define MV88E6XXX_FLAG_G2_POT		BIT_ULL(MV88E6XXX_CAP_G2_POT)
 
 #define MV88E6XXX_FLAG_STU		BIT_ULL(MV88E6XXX_CAP_STU)
-#define MV88E6XXX_FLAG_TEMP		BIT_ULL(MV88E6XXX_CAP_TEMP)
-#define MV88E6XXX_FLAG_TEMP_LIMIT	BIT_ULL(MV88E6XXX_CAP_TEMP_LIMIT)
 #define MV88E6XXX_FLAG_VTU		BIT_ULL(MV88E6XXX_CAP_VTU)
 
 /* Ingress Rate Limit unit */
@@ -586,7 +578,6 @@ enum mv88e6xxx_cap {
 	 MV88E6XXX_FLAG_G2_MGMT_EN_0X |	\
 	 MV88E6XXX_FLAG_G2_POT |	\
 	 MV88E6XXX_FLAG_STU |		\
-	 MV88E6XXX_FLAG_TEMP |		\
 	 MV88E6XXX_FLAG_VTU |		\
 	 MV88E6XXX_FLAGS_IRL |		\
 	 MV88E6XXX_FLAGS_MULTI_CHIP |	\
@@ -605,8 +596,6 @@ enum mv88e6xxx_cap {
 	 MV88E6XXX_FLAG_G2_MGMT_EN_2X |	\
 	 MV88E6XXX_FLAG_G2_MGMT_EN_0X |	\
 	 MV88E6XXX_FLAG_G2_POT |	\
-	 MV88E6XXX_FLAG_TEMP |		\
-	 MV88E6XXX_FLAG_TEMP_LIMIT |	\
 	 MV88E6XXX_FLAG_VTU |		\
 	 MV88E6XXX_FLAGS_IRL |		\
 	 MV88E6XXX_FLAGS_MULTI_CHIP |	\
@@ -621,7 +610,6 @@ enum mv88e6xxx_cap {
 	 MV88E6XXX_FLAG_G2_MGMT_EN_0X |	\
 	 MV88E6XXX_FLAG_G2_POT |	\
 	 MV88E6XXX_FLAG_STU |		\
-	 MV88E6XXX_FLAG_TEMP |		\
 	 MV88E6XXX_FLAG_VTU |		\
 	 MV88E6XXX_FLAGS_IRL |		\
 	 MV88E6XXX_FLAGS_MULTI_CHIP |	\
@@ -637,8 +625,6 @@ enum mv88e6xxx_cap {
 	 MV88E6XXX_FLAG_G2_MGMT_EN_0X |	\
 	 MV88E6XXX_FLAG_G2_POT |	\
 	 MV88E6XXX_FLAG_STU |		\
-	 MV88E6XXX_FLAG_TEMP |		\
-	 MV88E6XXX_FLAG_TEMP_LIMIT |	\
 	 MV88E6XXX_FLAG_VTU |		\
 	 MV88E6XXX_FLAGS_IRL |		\
 	 MV88E6XXX_FLAGS_MULTI_CHIP |	\
@@ -651,8 +637,6 @@ struct mv88e6xxx_ops;
 	(MV88E6XXX_FLAG_EEE |		\
 	 MV88E6XXX_FLAG_GLOBAL2 |	\
 	 MV88E6XXX_FLAG_STU |		\
-	 MV88E6XXX_FLAG_TEMP |		\
-	 MV88E6XXX_FLAG_TEMP_LIMIT |	\
 	 MV88E6XXX_FLAG_VTU |		\
 	 MV88E6XXX_FLAGS_IRL |		\
 	 MV88E6XXX_FLAGS_MULTI_CHIP |	\
diff --git a/include/net/dsa.h b/include/net/dsa.h
index c72ed7af2a2a..9d6cd923c48c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -307,14 +307,6 @@ struct dsa_switch_ops {
 	int	(*get_eee)(struct dsa_switch *ds, int port,
 			   struct ethtool_eee *e);
 
-#ifdef CONFIG_NET_DSA_HWMON
-	/* Hardware monitoring */
-	int	(*get_temp)(struct dsa_switch *ds, int *temp);
-	int	(*get_temp_limit)(struct dsa_switch *ds, int *temp);
-	int	(*set_temp_limit)(struct dsa_switch *ds, int temp);
-	int	(*get_temp_alarm)(struct dsa_switch *ds, bool *alarm);
-#endif
-
 	/* EEPROM access */
 	int	(*get_eeprom_len)(struct dsa_switch *ds);
 	int	(*get_eeprom)(struct dsa_switch *ds,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 39bb5b3a82f2..9649238eef40 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -15,17 +15,6 @@ config NET_DSA
 
 if NET_DSA
 
-config NET_DSA_HWMON
-	bool "Distributed Switch Architecture HWMON support"
-	default y
-	depends on HWMON && !(NET_DSA=y && HWMON=m)
-	---help---
-	  Say Y if you want to expose thermal sensor data on switches supported
-	  by the Distributed Switch Architecture.
-
-	  Some of those switches contain thermal sensors. This data is available
-	  via the hwmon sysfs interface and exposes the onboard sensors.
-
 # tagging formats
 config NET_DSA_TAG_BRCM
 	bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 560b6747c276..a3380ed0e0be 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,7 +1,6 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += dsa.o slave.o dsa2.o
-dsa_core-$(CONFIG_NET_DSA_HWMON) += hwmon.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 91f96e1bd2ec..77cb78767f1d 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -316,8 +316,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	if (ret)
 		return ret;
 
-	dsa_hwmon_register(ds);
-
 	return 0;
 }
 
@@ -376,8 +374,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 {
 	int port;
 
-	dsa_hwmon_unregister(ds);
-
 	/* Destroy network devices for physical switch ports. */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
 		if (!(ds->enabled_port_mask & (1 << port)))
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7e3385ec73f4..63ae1484abae 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -56,15 +56,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
 void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
 
-/* hwmon.c */
-#ifdef CONFIG_NET_DSA_HWMON
-void dsa_hwmon_register(struct dsa_switch *ds);
-void dsa_hwmon_unregister(struct dsa_switch *ds);
-#else
-static inline void dsa_hwmon_register(struct dsa_switch *ds) { }
-static inline void dsa_hwmon_unregister(struct dsa_switch *ds) { }
-#endif
-
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/hwmon.c b/net/dsa/hwmon.c
deleted file mode 100644
index 08831a811278..000000000000
--- a/net/dsa/hwmon.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * net/dsa/hwmon.c - HWMON subsystem support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/ctype.h>
-#include <linux/hwmon.h>
-#include <net/dsa.h>
-
-#include "dsa_priv.h"
-
-static ssize_t temp1_input_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-static DEVICE_ATTR_RO(temp1_input);
-
-static ssize_t temp1_max_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp_limit(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-
-static ssize_t temp1_max_store(struct device *dev,
-			       struct device_attribute *attr, const char *buf,
-			       size_t count)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = kstrtoint(buf, 0, &temp);
-	if (ret < 0)
-		return ret;
-
-	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-static DEVICE_ATTR_RW(temp1_max);
-
-static ssize_t temp1_max_alarm_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	bool alarm;
-	int ret;
-
-	ret = ds->ops->get_temp_alarm(ds, &alarm);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", alarm);
-}
-static DEVICE_ATTR_RO(temp1_max_alarm);
-
-static struct attribute *dsa_hwmon_attrs[] = {
-	&dev_attr_temp1_input.attr,	/* 0 */
-	&dev_attr_temp1_max.attr,	/* 1 */
-	&dev_attr_temp1_max_alarm.attr,	/* 2 */
-	NULL
-};
-
-static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
-				       struct attribute *attr, int index)
-{
-	struct device *dev = container_of(kobj, struct device, kobj);
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	const struct dsa_switch_ops *ops = ds->ops;
-	umode_t mode = attr->mode;
-
-	if (index == 1) {
-		if (!ops->get_temp_limit)
-			mode = 0;
-		else if (!ops->set_temp_limit)
-			mode &= ~S_IWUSR;
-	} else if (index == 2 && !ops->get_temp_alarm) {
-		mode = 0;
-	}
-	return mode;
-}
-
-static const struct attribute_group dsa_hwmon_group = {
-	.attrs = dsa_hwmon_attrs,
-	.is_visible = dsa_hwmon_attrs_visible,
-};
-__ATTRIBUTE_GROUPS(dsa_hwmon);
-
-void dsa_hwmon_register(struct dsa_switch *ds)
-{
-	const char *netname = netdev_name(ds->dst->master_netdev);
-	char hname[IFNAMSIZ + 1];
-	int i, j;
-
-	/* If the switch provides temperature accessors, register with hardware
-	 * monitoring subsystem. Treat registration error as non-fatal.
-	 */
-	if (!ds->ops->get_temp)
-		return;
-
-	/* Create valid hwmon 'name' attribute */
-	for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
-		if (isalnum(netname[i]))
-			hname[j++] = netname[i];
-	}
-	hname[j] = '\0';
-	scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", hname,
-		  ds->index);
-	ds->hwmon_dev = hwmon_device_register_with_groups(NULL, ds->hwmon_name,
-							  ds, dsa_hwmon_groups);
-	if (IS_ERR(ds->hwmon_dev)) {
-		pr_warn("DSA: failed to register HWMON subsystem for switch %d\n",
-			ds->index);
-		ds->hwmon_dev = NULL;
-	} else {
-		pr_info("DSA: registered HWMON subsystem for switch %d\n",
-			ds->index);
-	}
-}
-
-void dsa_hwmon_unregister(struct dsa_switch *ds)
-{
-	if (ds->hwmon_dev) {
-		hwmon_device_unregister(ds->hwmon_dev);
-		ds->hwmon_dev = NULL;
-	}
-}
-- 
cgit v1.2.3


From 530cef21d98930c4d10f188325e00330455d216d Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Fri, 20 Jan 2017 22:36:53 +0800
Subject: 6lowpan: use rb_entry()

To make the code clearer, use rb_entry() instead of container_of() to
deal with rbtree.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/6lowpan/nhc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 7008d53e455c..4fa2fdda174d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
 
 	/* Figure out where to put new node */
 	while (*new) {
-		struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
-						       node);
+		struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
+						   node);
 		int result, len_dif, len;
 
 		len_dif = nhc->idlen - this->idlen;
@@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
 	const u8 *nhcid_skb_ptr = skb->data;
 
 	while (node) {
-		struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
-						      node);
+		struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
+						  node);
 		u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
 		int result, i;
 
-- 
cgit v1.2.3


From 9ca677b1bd8979f9a7702db5ed4028e4b8b232e8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 20 Jan 2017 08:08:56 -0800
Subject: ipv6: add NUMA awareness to seg6_hmac_init_algo()

Since we allocate per cpu storage, let's also use NUMA hints.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_hmac.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 5215e1eba010..b274f1d95e03 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -389,7 +389,8 @@ static int seg6_hmac_init_algo(void)
 			return -ENOMEM;
 
 		for_each_possible_cpu(cpu) {
-			shash = kzalloc(shsize, GFP_KERNEL);
+			shash = kzalloc_node(shsize, GFP_KERNEL,
+					     cpu_to_node(cpu));
 			if (!shash)
 				return -ENOMEM;
 			*per_cpu_ptr(algo->shashs, cpu) = shash;
-- 
cgit v1.2.3


From 1331b62c97217459780e040e8a66bb609f2acd20 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Mon, 2 Jan 2017 16:01:57 +0100
Subject: rfkill: remove rfkill-regulator

There are no users of this ("vrfkill") in the tree, so it's just
dead code - remove it.

This also isn't really how rfkill is supposed to be used - it's
intended as a signalling mechanism to/from the device, which the
driver (and partially cfg80211) will handle - having a separate
rfkill instance for a regulator is confusing, the driver should
use the regulator instead to turn off the device when requested.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill-regulator.h |  48 ------------
 net/rfkill/Kconfig               |  11 ---
 net/rfkill/Makefile              |   1 -
 net/rfkill/rfkill-regulator.c    | 154 ---------------------------------------
 4 files changed, 214 deletions(-)
 delete mode 100644 include/linux/rfkill-regulator.h
 delete mode 100644 net/rfkill/rfkill-regulator.c

(limited to 'net')

diff --git a/include/linux/rfkill-regulator.h b/include/linux/rfkill-regulator.h
deleted file mode 100644
index aca36bc83315..000000000000
--- a/include/linux/rfkill-regulator.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#ifndef __LINUX_RFKILL_REGULATOR_H
-#define __LINUX_RFKILL_REGULATOR_H
-
-/*
- * Use "vrfkill" as supply id when declaring the regulator consumer:
- *
- * static struct regulator_consumer_supply pcap_regulator_V6_consumers [] = {
- * 	{ .dev_name = "rfkill-regulator.0", .supply = "vrfkill" },
- * };
- *
- * If you have several regulator driven rfkill, you can append a numerical id to
- * .dev_name as done above, and use the same id when declaring the platform
- * device:
- *
- * static struct rfkill_regulator_platform_data ezx_rfkill_bt_data = {
- * 	.name  = "ezx-bluetooth",
- * 	.type  = RFKILL_TYPE_BLUETOOTH,
- * };
- *
- * static struct platform_device a910_rfkill = {
- * 	.name  = "rfkill-regulator",
- * 	.id    = 0,
- * 	.dev   = {
- * 		.platform_data = &ezx_rfkill_bt_data,
- * 	},
- * };
- */
-
-#include <linux/rfkill.h>
-
-struct rfkill_regulator_platform_data {
-	char *name;             /* the name for the rfkill switch */
-	enum rfkill_type type;  /* the type as specified in rfkill.h */
-};
-
-#endif /* __LINUX_RFKILL_REGULATOR_H */
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
 	depends on INPUT = y || RFKILL = INPUT
 	default y if !EXPERT
 
-config RFKILL_REGULATOR
-	tristate "Generic rfkill regulator driver"
-	depends on RFKILL || !RFKILL
-	depends on REGULATOR
-	help
-          This options enable controlling radio transmitters connected to
-          voltage regulator using the regulator framework.
-
-          To compile this driver as a module, choose M here: the module will
-          be called rfkill-regulator.
-
 config RFKILL_GPIO
 	tristate "GPIO RFKILL driver"
 	depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
 rfkill-y			+= core.o
 rfkill-$(CONFIG_RFKILL_INPUT)	+= input.o
 obj-$(CONFIG_RFKILL)		+= rfkill.o
-obj-$(CONFIG_RFKILL_REGULATOR)	+= rfkill-regulator.o
 obj-$(CONFIG_RFKILL_GPIO)	+= rfkill-gpio.o
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
- *
- * Implementation inspired by leds-regulator driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/rfkill.h>
-#include <linux/rfkill-regulator.h>
-
-struct rfkill_regulator_data {
-	struct rfkill *rf_kill;
-	bool reg_enabled;
-
-	struct regulator *vcc;
-};
-
-static int rfkill_regulator_set_block(void *data, bool blocked)
-{
-	struct rfkill_regulator_data *rfkill_data = data;
-	int ret = 0;
-
-	pr_debug("%s: blocked: %d\n", __func__, blocked);
-
-	if (blocked) {
-		if (rfkill_data->reg_enabled) {
-			regulator_disable(rfkill_data->vcc);
-			rfkill_data->reg_enabled = false;
-		}
-	} else {
-		if (!rfkill_data->reg_enabled) {
-			ret = regulator_enable(rfkill_data->vcc);
-			if (!ret)
-				rfkill_data->reg_enabled = true;
-		}
-	}
-
-	pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
-		regulator_is_enabled(rfkill_data->vcc));
-
-	return ret;
-}
-
-static struct rfkill_ops rfkill_regulator_ops = {
-	.set_block = rfkill_regulator_set_block,
-};
-
-static int rfkill_regulator_probe(struct platform_device *pdev)
-{
-	struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
-	struct rfkill_regulator_data *rfkill_data;
-	struct regulator *vcc;
-	struct rfkill *rf_kill;
-	int ret = 0;
-
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "no platform data\n");
-		return -ENODEV;
-	}
-
-	if (pdata->name == NULL || pdata->type == 0) {
-		dev_err(&pdev->dev, "invalid name or type in platform data\n");
-		return -EINVAL;
-	}
-
-	vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
-	if (IS_ERR(vcc)) {
-		dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
-		ret = PTR_ERR(vcc);
-		goto out;
-	}
-
-	rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
-	if (rfkill_data == NULL) {
-		ret = -ENOMEM;
-		goto err_data_alloc;
-	}
-
-	rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
-				pdata->type,
-				&rfkill_regulator_ops, rfkill_data);
-	if (rf_kill == NULL) {
-		ret = -ENOMEM;
-		goto err_rfkill_alloc;
-	}
-
-	if (regulator_is_enabled(vcc)) {
-		dev_dbg(&pdev->dev, "Regulator already enabled\n");
-		rfkill_data->reg_enabled = true;
-	}
-	rfkill_data->vcc = vcc;
-	rfkill_data->rf_kill = rf_kill;
-
-	ret = rfkill_register(rf_kill);
-	if (ret) {
-		dev_err(&pdev->dev, "Cannot register rfkill device\n");
-		goto err_rfkill_register;
-	}
-
-	platform_set_drvdata(pdev, rfkill_data);
-	dev_info(&pdev->dev, "%s initialized\n", pdata->name);
-
-	return 0;
-
-err_rfkill_register:
-	rfkill_destroy(rf_kill);
-err_rfkill_alloc:
-	kfree(rfkill_data);
-err_data_alloc:
-	regulator_put(vcc);
-out:
-	return ret;
-}
-
-static int rfkill_regulator_remove(struct platform_device *pdev)
-{
-	struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
-	struct rfkill *rf_kill = rfkill_data->rf_kill;
-
-	rfkill_unregister(rf_kill);
-	rfkill_destroy(rf_kill);
-	regulator_put(rfkill_data->vcc);
-	kfree(rfkill_data);
-
-	return 0;
-}
-
-static struct platform_driver rfkill_regulator_driver = {
-	.probe = rfkill_regulator_probe,
-	.remove = rfkill_regulator_remove,
-	.driver = {
-		.name = "rfkill-regulator",
-	},
-};
-
-module_platform_driver(rfkill_regulator_driver);
-
-MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
-MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
-MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:rfkill-regulator");
-- 
cgit v1.2.3


From 4548b683b78137f8eadeb312b94e20bb0d4a7141 Mon Sep 17 00:00:00 2001
From: Krister Johansen
Date: Fri, 20 Jan 2017 17:49:11 -0800
Subject: Introduce a sysctl that modifies the value of PROT_SOCK.

Add net.ipv4.ip_unprivileged_port_start, which is a per namespace sysctl
that denotes the first unprivileged inet port in the namespace.  To
disable all privileged ports set this to zero.  It also checks for
overlap with the local port range.  The privileged and local range may
not overlap.

The use case for this change is to allow containerized processes to bind
to priviliged ports, but prevent them from ever being allowed to modify
their container's network configuration.  The latter is accomplished by
ensuring that the network namespace is not a child of the user
namespace.  This modification was needed to allow the container manager
to disable a namespace's priviliged port restrictions without exposing
control of the network namespace to processes in the user namespace.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 ++++++
 include/net/ip.h                       | 10 +++++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/af_inet.c                     |  5 +++-
 net/ipv4/sysctl_net_ipv4.c             | 50 +++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c                    |  3 +-
 net/netfilter/ipvs/ip_vs_ctl.c         |  7 ++---
 net/sctp/socket.c                      | 10 ++++---
 security/selinux/hooks.c               |  3 +-
 9 files changed, 86 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index aa1bb49f1dc6..17f2e7791042 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -822,6 +822,15 @@ ip_local_reserved_ports - list of comma separated ranges
 
 	Default: Empty
 
+ip_unprivileged_port_start - INTEGER
+	This is a per-namespace sysctl.  It defines the first
+	unprivileged port in the network namespace.  Privileged ports
+	require root or CAP_NET_BIND_SERVICE in order to bind to them.
+	To disable all privileged ports, set this to 0.  It may not
+	overlap with the ip_local_reserved_ports range.
+
+	Default: 1024
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
diff --git a/include/net/ip.h b/include/net/ip.h
index ab6761a7c883..bf264a8db1ce 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -263,11 +263,21 @@ static inline bool sysctl_dev_name_is_allowed(const char *name)
 	return strcmp(name, "default") != 0  && strcmp(name, "all") != 0;
 }
 
+static inline int inet_prot_sock(struct net *net)
+{
+	return net->ipv4.sysctl_ip_prot_sock;
+}
+
 #else
 static inline int inet_is_local_reserved_port(struct net *net, int port)
 {
 	return 0;
 }
+
+static inline int inet_prot_sock(struct net *net)
+{
+	return PROT_SOCK;
+}
 #endif
 
 __be32 inet_current_timestamp(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 8e3f5b6f26d5..e365732b8051 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,7 @@ struct netns_ipv4 {
 
 #ifdef CONFIG_SYSCTL
 	unsigned long *sysctl_local_reserved_ports;
+	int sysctl_ip_prot_sock;
 #endif
 
 #ifdef CONFIG_IP_MROUTE
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index aae410bb655a..28fe8da4e1ac 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	snum = ntohs(addr->sin_port);
 	err = -EACCES;
-	if (snum && snum < PROT_SOCK &&
+	if (snum && snum < inet_prot_sock(net) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		goto out;
 
@@ -1700,6 +1700,9 @@ static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
 	net->ipv4.sysctl_ip_dynaddr = 0;
 	net->ipv4.sysctl_ip_early_demux = 1;
+#ifdef CONFIG_SYSCTL
+	net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
 
 	return 0;
 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c8d283615c6f..1b861997fdc5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
 static int tcp_adv_win_scale_min = -31;
 static int tcp_adv_win_scale_max = 31;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
 static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
 static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
-		if (range[1] < range[0])
+		/* Ensure that the upper limit is not smaller than the lower,
+		 * and that the lower does not encroach upon the privileged
+		 * port limit.
+		 */
+		if ((range[1] < range[0]) ||
+		    (range[0] < net->ipv4.sysctl_ip_prot_sock))
 			ret = -EINVAL;
 		else
 			set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 	return ret;
 }
 
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_ip_prot_sock);
+	int ret;
+	int pports;
+	int range[2];
+	struct ctl_table tmp = {
+		.data = &pports,
+		.maxlen = sizeof(pports),
+		.mode = table->mode,
+		.extra1 = &ip_privileged_port_min,
+		.extra2 = &ip_privileged_port_max,
+	};
+
+	pports = net->ipv4.sysctl_ip_prot_sock;
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		inet_get_local_port_range(net, &range[0], &range[1]);
+		/* Ensure that the local port range doesn't overlap with the
+		 * privileged port range.
+		 */
+		if (range[0] < pports)
+			ret = -EINVAL;
+		else
+			net->ipv4.sysctl_ip_prot_sock = pports;
+	}
+
+	return ret;
+}
 
 static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
@@ -964,6 +1005,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "ip_unprivileged_port_start",
+		.maxlen		= sizeof(int),
+		.data		= &init_net.ipv4.sysctl_ip_prot_sock,
+		.mode		= 0644,
+		.proc_handler	= ipv4_privileged_ports,
+	},
 	{ }
 };
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index aa42123bc301..04db40620ea6 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -302,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		return -EINVAL;
 
 	snum = ntohs(addr->sin6_port);
-	if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+	if (snum && snum < inet_prot_sock(net) &&
+	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
 	lock_sock(sk);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 55e0169caa4c..8b7416f4e01a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
 	 */
 	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
 
-	if (svc == NULL
-	    && protocol == IPPROTO_TCP
-	    && atomic_read(&ipvs->ftpsvc_counter)
-	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+	if (!svc && protocol == IPPROTO_TCP &&
+	    atomic_read(&ipvs->ftpsvc_counter) &&
+	    (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
 		/*
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bee4dd3feabb..d699d2cbf275 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -360,7 +360,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 		}
 	}
 
-	if (snum && snum < PROT_SOCK &&
+	if (snum && snum < inet_prot_sock(net) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
@@ -1152,8 +1152,10 @@ static int __sctp_connect(struct sock *sk,
 				 * accept new associations, but it SHOULD NOT
 				 * be permitted to open new associations.
 				 */
-				if (ep->base.bind_addr.port < PROT_SOCK &&
-				    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
+				if (ep->base.bind_addr.port <
+				    inet_prot_sock(net) &&
+				    !ns_capable(net->user_ns,
+				    CAP_NET_BIND_SERVICE)) {
 					err = -EACCES;
 					goto out_free;
 				}
@@ -1818,7 +1820,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 			 * but it SHOULD NOT be permitted to open new
 			 * associations.
 			 */
-			if (ep->base.bind_addr.port < PROT_SOCK &&
+			if (ep->base.bind_addr.port < inet_prot_sock(net) &&
 			    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
 				err = -EACCES;
 				goto out_unlock;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c7c6619431d5..53cb6da5f1c6 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4365,7 +4365,8 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 
 			inet_get_local_port_range(sock_net(sk), &low, &high);
 
-			if (snum < max(PROT_SOCK, low) || snum > high) {
+			if (snum < max(inet_prot_sock(sock_net(sk)), low) ||
+			    snum > high) {
 				err = sel_netport_sid(sk->sk_protocol,
 						      snum, &sid);
 				if (err)
-- 
cgit v1.2.3


From 6db6f0eae6052b70885562e1733896647ec1d807 Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Sat, 21 Jan 2017 21:01:32 +0100
Subject: bridge: multicast to unicast

Implements an optional, per bridge port flag and feature to deliver
multicast packets to any host on the according port via unicast
individually. This is done by copying the packet per host and
changing the multicast destination MAC to a unicast one accordingly.

multicast-to-unicast works on top of the multicast snooping feature of
the bridge. Which means unicast copies are only delivered to hosts which
are interested in it and signalized this via IGMP/MLD reports
previously.

This feature is intended for interface types which have a more reliable
and/or efficient way to deliver unicast packets than broadcast ones
(e.g. wifi).

However, it should only be enabled on interfaces where no IGMPv2/MLDv1
report suppression takes place. This feature is disabled by default.

The initial patch and idea is from Felix Fietkau.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
[linus.luessing@c0d3.blue: various bug + style fixes, commit message]
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    |  1 +
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_forward.c      | 39 ++++++++++++++++++-
 net/bridge/br_mdb.c          |  2 +-
 net/bridge/br_multicast.c    | 90 ++++++++++++++++++++++++++++++++------------
 net/bridge/br_netlink.c      |  5 +++
 net/bridge/br_private.h      |  3 +-
 net/bridge/br_sysfs_if.c     |  2 +
 8 files changed, 114 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index c6587c01d951..debc9d5904e5 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -46,6 +46,7 @@ struct br_ip_list {
 #define BR_LEARNING_SYNC	BIT(9)
 #define BR_PROXYARP_WIFI	BIT(10)
 #define BR_MCAST_FLOOD		BIT(11)
+#define BR_MULTICAST_TO_UNICAST	BIT(12)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 184b16ed2b84..b9aa5641ebe5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -321,6 +321,7 @@ enum {
 	IFLA_BRPORT_MULTICAST_ROUTER,
 	IFLA_BRPORT_PAD,
 	IFLA_BRPORT_MCAST_FLOOD,
+	IFLA_BRPORT_MCAST_TO_UCAST,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..a0f9d0037d24 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -174,6 +174,31 @@ out:
 	return p;
 }
 
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+			       const unsigned char *addr, bool local_orig)
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+	const unsigned char *src = eth_hdr(skb)->h_source;
+
+	if (!should_deliver(p, skb))
+		return;
+
+	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+	if (skb->dev == p->dev && ether_addr_equal(src, addr))
+		return;
+
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb) {
+		dev->stats.tx_dropped++;
+		return;
+	}
+
+	if (!is_broadcast_ether_addr(addr))
+		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+
+	__br_forward(p, skb, local_orig);
+}
+
 /* called under rcu_read_lock */
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
@@ -241,10 +266,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 		rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
 			     NULL;
 
-		port = (unsigned long)lport > (unsigned long)rport ?
-		       lport : rport;
+		if ((unsigned long)lport > (unsigned long)rport) {
+			port = lport;
+
+			if (port->flags & BR_MULTICAST_TO_UNICAST) {
+				maybe_deliver_addr(lport, skb, p->eth_addr,
+						   local_orig);
+				goto delivered;
+			}
+		} else {
+			port = rport;
+		}
 
 		prev = maybe_deliver(prev, port, skb, local_orig);
+delivered:
 		if (IS_ERR(prev))
 			goto out;
 		if (prev == port)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, state);
+	p = br_multicast_new_port_group(port, group, *pp, state, NULL);
 	if (unlikely(!p))
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f66346122dc4..1de3438e36bf 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -43,12 +43,14 @@ static void br_multicast_add_router(struct net_bridge *br,
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid);
+					 __u16 vid,
+					 const unsigned char *src);
+
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid);
+					 __u16 vid, const unsigned char *src);
 #endif
 unsigned int br_mdb_rehash_seq;
 
@@ -711,7 +713,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 			struct net_bridge_port *port,
 			struct br_ip *group,
 			struct net_bridge_port_group __rcu *next,
-			unsigned char flags)
+			unsigned char flags,
+			const unsigned char *src)
 {
 	struct net_bridge_port_group *p;
 
@@ -726,12 +729,32 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 	hlist_add_head(&p->mglist, &port->mglist);
 	setup_timer(&p->timer, br_multicast_port_group_expired,
 		    (unsigned long)p);
+
+	if (src)
+		memcpy(p->eth_addr, src, ETH_ALEN);
+	else
+		memset(p->eth_addr, 0xff, ETH_ALEN);
+
 	return p;
 }
 
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+				struct net_bridge_port *port,
+				const unsigned char *src)
+{
+	if (p->port != port)
+		return false;
+
+	if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+		return true;
+
+	return ether_addr_equal(src, p->eth_addr);
+}
+
 static int br_multicast_add_group(struct net_bridge *br,
 				  struct net_bridge_port *port,
-				  struct br_ip *group)
+				  struct br_ip *group,
+				  const unsigned char *src)
 {
 	struct net_bridge_port_group __rcu **pp;
 	struct net_bridge_port_group *p;
@@ -758,13 +781,13 @@ static int br_multicast_add_group(struct net_bridge *br,
 	for (pp = &mp->ports;
 	     (p = mlock_dereference(*pp, br)) != NULL;
 	     pp = &p->next) {
-		if (p->port == port)
+		if (br_port_group_equal(p, port, src))
 			goto found;
 		if ((unsigned long)p->port < (unsigned long)port)
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, 0);
+	p = br_multicast_new_port_group(port, group, *pp, 0, src);
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
@@ -783,7 +806,8 @@ err:
 static int br_ip4_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      __be32 group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -794,14 +818,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IP);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int br_ip6_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      const struct in6_addr *group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -812,7 +837,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IPV6);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 #endif
 
@@ -1081,6 +1106,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 					 struct sk_buff *skb,
 					 u16 vid)
 {
+	const unsigned char *src;
 	struct igmpv3_report *ih;
 	struct igmpv3_grec *grec;
 	int i;
@@ -1121,12 +1147,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
 		     type == IGMPV3_MODE_IS_INCLUDE) &&
 		    ntohs(grec->grec_nsrcs) == 0) {
-			br_ip4_multicast_leave_group(br, port, group, vid);
+			br_ip4_multicast_leave_group(br, port, group, vid, src);
 		} else {
-			err = br_ip4_multicast_add_group(br, port, group, vid);
+			err = br_ip4_multicast_add_group(br, port, group, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1141,6 +1169,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 					struct sk_buff *skb,
 					u16 vid)
 {
+	const unsigned char *src;
 	struct icmp6hdr *icmp6h;
 	struct mld2_grec *grec;
 	int i;
@@ -1188,14 +1217,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
 		     grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
 		    ntohs(*nsrcs) == 0) {
 			br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
-						     vid);
+						     vid, src);
 		} else {
 			err = br_ip6_multicast_add_group(br, port,
-							 &grec->grec_mca, vid);
+							 &grec->grec_mca, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1511,7 +1542,8 @@ br_multicast_leave_group(struct net_bridge *br,
 			 struct net_bridge_port *port,
 			 struct br_ip *group,
 			 struct bridge_mcast_other_query *other_query,
-			 struct bridge_mcast_own_query *own_query)
+			 struct bridge_mcast_own_query *own_query,
+			 const unsigned char *src)
 {
 	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
@@ -1535,7 +1567,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (pp = &mp->ports;
 		     (p = mlock_dereference(*pp, br)) != NULL;
 		     pp = &p->next) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			rcu_assign_pointer(*pp, p->next);
@@ -1566,7 +1598,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (p = mlock_dereference(mp->ports, br);
 		     p != NULL;
 		     p = mlock_dereference(p->next, br)) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			if (!hlist_unhashed(&p->mglist) &&
@@ -1617,7 +1649,8 @@ out:
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1632,14 +1665,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
-				 own_query);
+				 own_query, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1654,7 +1688,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
-				 own_query);
+				 own_query, src);
 }
 #endif
 
@@ -1712,6 +1746,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct igmphdr *ih;
 	int err;
 
@@ -1731,13 +1766,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 	}
 
 	ih = igmp_hdr(skb);
+	src = eth_hdr(skb)->h_source;
 	BR_INPUT_SKB_CB(skb)->igmp = ih->type;
 
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+		err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
 		break;
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
 		err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1746,7 +1782,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_LEAVE_MESSAGE:
-		br_ip4_multicast_leave_group(br, port, ih->group, vid);
+		br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
 		break;
 	}
 
@@ -1766,6 +1802,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct mld_msg *mld;
 	int err;
 
@@ -1785,8 +1822,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 
 	switch (mld->mld_type) {
 	case ICMPV6_MGM_REPORT:
+		src = eth_hdr(skb)->h_source;
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
+						 src);
 		break;
 	case ICMPV6_MLD2_REPORT:
 		err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1795,7 +1834,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 		err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_REDUCTION:
-		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+		src = eth_hdr(skb)->h_source;
+		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
 		break;
 	}
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 71c7453268c1..6c087cd049b9 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -123,6 +123,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
 		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
+		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_TO_UCAST */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
 		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
@@ -173,6 +174,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       !!(p->flags & BR_ROOT_BLOCK)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
 		       !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
+		       !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
 		       !!(p->flags & BR_FLOOD)) ||
@@ -586,6 +589,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_PROXYARP]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
 	[IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
+	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -636,6 +640,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
 	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
 	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8ce621e8345c..0b82a227fc34 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -177,6 +177,7 @@ struct net_bridge_port_group {
 	struct timer_list		timer;
 	struct br_ip			addr;
 	unsigned char			flags;
+	unsigned char			eth_addr[ETH_ALEN];
 };
 
 struct net_bridge_mdb_entry
@@ -599,7 +600,7 @@ void br_multicast_free_pg(struct rcu_head *head);
 struct net_bridge_port_group *
 br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    struct net_bridge_port_group __rcu *next,
-			    unsigned char flags);
+			    unsigned char flags, const unsigned char *src);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..05e8946ccc03 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -188,6 +188,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
 		   store_multicast_router);
 
 BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
 #endif
 
 static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +215,7 @@ static const struct brport_attribute *brport_attrs[] = {
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
+	&brport_attr_multicast_to_unicast,
 #endif
 	&brport_attr_proxyarp,
 	&brport_attr_proxyarp_wifi,
-- 
cgit v1.2.3


From 23e3d618e49eb560052fe59a3c9629d3650ba46e Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Sun, 22 Jan 2017 22:16:45 +0100
Subject: net: dsa: Fix inverted test for multiple CPU interface

Remove the wrong !, otherwise we get false positives about having
multiple CPU interfaces.

Fixes: b22de490869d ("net: dsa: store CPU switch structure in the tree")
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 77cb78767f1d..1f3afeb673d6 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -225,7 +225,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			continue;
 
 		if (!strcmp(name, "cpu")) {
-			if (!dst->cpu_switch) {
+			if (dst->cpu_switch) {
 				netdev_err(dst->master_netdev,
 					   "multiple cpu ports?!\n");
 				return -EINVAL;
-- 
cgit v1.2.3


From 6ae0a6286171154661b74f7f550f9441c6008424 Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Mon, 23 Jan 2017 11:07:08 +0100
Subject: net: Introduce psample, a new genetlink channel for packet sampling

Add a general way for kernel modules to sample packets, without being tied
to any specific subsystem. This netlink channel can be used by tc,
iptables, etc. and allow to standardize packet sampling in the kernel.

For every sampled packet, the psample module adds the following metadata
fields:

PSAMPLE_ATTR_IIFINDEX - the packets input ifindex, if applicable

PSAMPLE_ATTR_OIFINDEX - the packet output ifindex, if applicable

PSAMPLE_ATTR_ORIGSIZE - the packet's original size, in case it has been
   truncated during sampling

PSAMPLE_ATTR_SAMPLE_GROUP - the packet's sample group, which is set by the
   user who initiated the sampling. This field allows the user to
   differentiate between several samplers working simultaneously and
   filter packets relevant to him

PSAMPLE_ATTR_GROUP_SEQ - sequence counter of last sent packet. The
   sequence is kept for each group

PSAMPLE_ATTR_SAMPLE_RATE - the sampling rate used for sampling the packets

PSAMPLE_ATTR_DATA - the actual packet bits

The sampled packets are sent to the PSAMPLE_NL_MCGRP_SAMPLE multicast
group. In addition, add the GET_GROUPS netlink command which allows the
user to see the current sample groups, their refcount and sequence number.
This command currently supports only netlink dump mode.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |   7 +
 include/net/psample.h        |  36 ++++++
 include/uapi/linux/Kbuild    |   1 +
 include/uapi/linux/psample.h |  35 +++++
 net/Kconfig                  |   1 +
 net/Makefile                 |   1 +
 net/psample/Kconfig          |  15 +++
 net/psample/Makefile         |   5 +
 net/psample/psample.c        | 301 +++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 402 insertions(+)
 create mode 100644 include/net/psample.h
 create mode 100644 include/uapi/linux/psample.h
 create mode 100644 net/psample/Kconfig
 create mode 100644 net/psample/Makefile
 create mode 100644 net/psample/psample.c

(limited to 'net')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3c84a8fecc09..d76fccd09266 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9957,6 +9957,13 @@ L:	linuxppc-dev@lists.ozlabs.org
 S:	Maintained
 F:	drivers/block/ps3vram.c
 
+PSAMPLE PACKET SAMPLING SUPPORT:
+M:	Yotam Gigi <yotamg@mellanox.com>
+S:	Maintained
+F:	net/psample
+F:	include/net/psample.h
+F:	include/uapi/linux/psample.h
+
 PSTORE FILESYSTEM
 M:	Anton Vorontsov <anton@enomsg.org>
 M:	Colin Cross <ccross@android.com>
diff --git a/include/net/psample.h b/include/net/psample.h
new file mode 100644
index 000000000000..8888b0e1a82e
--- /dev/null
+++ b/include/net/psample.h
@@ -0,0 +1,36 @@
+#ifndef __NET_PSAMPLE_H
+#define __NET_PSAMPLE_H
+
+#include <uapi/linux/psample.h>
+#include <linux/module.h>
+#include <linux/list.h>
+
+struct psample_group {
+	struct list_head list;
+	struct net *net;
+	u32 group_num;
+	u32 refcount;
+	u32 seq;
+};
+
+struct psample_group *psample_group_get(struct net *net, u32 group_num);
+void psample_group_put(struct psample_group *group);
+
+#if IS_ENABLED(CONFIG_PSAMPLE)
+
+void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
+			   u32 trunc_size, int in_ifindex, int out_ifindex,
+			   u32 sample_rate);
+
+#else
+
+static inline void psample_sample_packet(struct psample_group *group,
+					 struct sk_buff *skb, u32 trunc_size,
+					 int in_ifindex, int out_ifindex,
+					 u32 sample_rate)
+{
+}
+
+#endif
+
+#endif /* __NET_PSAMPLE_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index e600b50be77e..80ad741a42fa 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -305,6 +305,7 @@ header-y += netrom.h
 header-y += net_namespace.h
 header-y += net_tstamp.h
 header-y += nfc.h
+header-y += psample.h
 header-y += nfs2.h
 header-y += nfs3.h
 header-y += nfs4.h
diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h
new file mode 100644
index 000000000000..ed48996ec0e8
--- /dev/null
+++ b/include/uapi/linux/psample.h
@@ -0,0 +1,35 @@
+#ifndef __UAPI_PSAMPLE_H
+#define __UAPI_PSAMPLE_H
+
+enum {
+	/* sampled packet metadata */
+	PSAMPLE_ATTR_IIFINDEX,
+	PSAMPLE_ATTR_OIFINDEX,
+	PSAMPLE_ATTR_ORIGSIZE,
+	PSAMPLE_ATTR_SAMPLE_GROUP,
+	PSAMPLE_ATTR_GROUP_SEQ,
+	PSAMPLE_ATTR_SAMPLE_RATE,
+	PSAMPLE_ATTR_DATA,
+
+	/* commands attributes */
+	PSAMPLE_ATTR_GROUP_REFCOUNT,
+
+	__PSAMPLE_ATTR_MAX
+};
+
+enum psample_command {
+	PSAMPLE_CMD_SAMPLE,
+	PSAMPLE_CMD_GET_GROUP,
+	PSAMPLE_CMD_NEW_GROUP,
+	PSAMPLE_CMD_DEL_GROUP,
+};
+
+/* Can be overridden at runtime by module option */
+#define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1)
+
+#define PSAMPLE_NL_MCGRP_CONFIG_NAME "config"
+#define PSAMPLE_NL_MCGRP_SAMPLE_NAME "packets"
+#define PSAMPLE_GENL_NAME "psample"
+#define PSAMPLE_GENL_VERSION 1
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index 92ae1500d9e1..ce4aee69fc0d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -390,6 +390,7 @@ source "net/9p/Kconfig"
 source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
diff --git a/net/Makefile b/net/Makefile
index 5d6e0e5ff7f8..7d41de48310e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_PSAMPLE)		+= psample/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_MPLS)		+= mpls/
diff --git a/net/psample/Kconfig b/net/psample/Kconfig
new file mode 100644
index 000000000000..d850246a6059
--- /dev/null
+++ b/net/psample/Kconfig
@@ -0,0 +1,15 @@
+#
+# psample packet sampling configuration
+#
+
+menuconfig PSAMPLE
+	depends on NET
+	tristate "Packet-sampling netlink channel"
+	default n
+	help
+	  Say Y here to add support for packet-sampling netlink channel
+	  This netlink channel allows transferring packets alongside some
+	  metadata to userspace.
+
+	  To compile this support as a module, choose M here: the module will
+	  be called psample.
diff --git a/net/psample/Makefile b/net/psample/Makefile
new file mode 100644
index 000000000000..609b0a79c9f3
--- /dev/null
+++ b/net/psample/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the psample netlink channel
+#
+
+obj-$(CONFIG_PSAMPLE) += psample.o
diff --git a/net/psample/psample.c b/net/psample/psample.c
new file mode 100644
index 000000000000..8aa58a918783
--- /dev/null
+++ b/net/psample/psample.c
@@ -0,0 +1,301 @@
+/*
+ * net/psample/psample.c - Netlink channel for packet sampling
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/psample.h>
+#include <linux/spinlock.h>
+
+#define PSAMPLE_MAX_PACKET_SIZE 0xffff
+
+static LIST_HEAD(psample_groups_list);
+static DEFINE_SPINLOCK(psample_groups_lock);
+
+/* multicast groups */
+enum psample_nl_multicast_groups {
+	PSAMPLE_NL_MCGRP_CONFIG,
+	PSAMPLE_NL_MCGRP_SAMPLE,
+};
+
+static const struct genl_multicast_group psample_nl_mcgrps[] = {
+	[PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME },
+	[PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME },
+};
+
+static struct genl_family psample_nl_family __ro_after_init;
+
+static int psample_group_nl_fill(struct sk_buff *msg,
+				 struct psample_group *group,
+				 enum psample_command cmd, u32 portid, u32 seq,
+				 int flags)
+{
+	void *hdr;
+	int ret;
+
+	hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+	if (ret < 0)
+		goto error;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount);
+	if (ret < 0)
+		goto error;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq);
+	if (ret < 0)
+		goto error;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+error:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
+					   struct netlink_callback *cb)
+{
+	struct psample_group *group;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	spin_lock(&psample_groups_lock);
+	list_for_each_entry(group, &psample_groups_list, list) {
+		if (!net_eq(group->net, sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			break;
+		idx++;
+	}
+
+	spin_unlock(&psample_groups_lock);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static const struct genl_ops psample_nl_ops[] = {
+	{
+		.cmd = PSAMPLE_CMD_GET_GROUP,
+		.dumpit = psample_nl_cmd_get_group_dumpit,
+		/* can be retrieved by unprivileged users */
+	}
+};
+
+static struct genl_family psample_nl_family __ro_after_init = {
+	.name		= PSAMPLE_GENL_NAME,
+	.version	= PSAMPLE_GENL_VERSION,
+	.maxattr	= PSAMPLE_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.mcgrps		= psample_nl_mcgrps,
+	.ops		= psample_nl_ops,
+	.n_ops		= ARRAY_SIZE(psample_nl_ops),
+	.n_mcgrps	= ARRAY_SIZE(psample_nl_mcgrps),
+};
+
+static void psample_group_notify(struct psample_group *group,
+				 enum psample_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!msg)
+		return;
+
+	err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI);
+	if (!err)
+		genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0,
+					PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC);
+	else
+		nlmsg_free(msg);
+}
+
+static struct psample_group *psample_group_create(struct net *net,
+						  u32 group_num)
+{
+	struct psample_group *group;
+
+	group = kzalloc(sizeof(*group), GFP_ATOMIC);
+	if (!group)
+		return NULL;
+
+	group->net = net;
+	group->group_num = group_num;
+	list_add_tail(&group->list, &psample_groups_list);
+
+	psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP);
+	return group;
+}
+
+static void psample_group_destroy(struct psample_group *group)
+{
+	psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
+	list_del(&group->list);
+	kfree(group);
+}
+
+static struct psample_group *
+psample_group_lookup(struct net *net, u32 group_num)
+{
+	struct psample_group *group;
+
+	list_for_each_entry(group, &psample_groups_list, list)
+		if ((group->group_num == group_num) && (group->net == net))
+			return group;
+	return NULL;
+}
+
+struct psample_group *psample_group_get(struct net *net, u32 group_num)
+{
+	struct psample_group *group;
+
+	spin_lock(&psample_groups_lock);
+
+	group = psample_group_lookup(net, group_num);
+	if (!group) {
+		group = psample_group_create(net, group_num);
+		if (!group)
+			goto out;
+	}
+	group->refcount++;
+
+out:
+	spin_unlock(&psample_groups_lock);
+	return group;
+}
+EXPORT_SYMBOL_GPL(psample_group_get);
+
+void psample_group_put(struct psample_group *group)
+{
+	spin_lock(&psample_groups_lock);
+
+	if (--group->refcount == 0)
+		psample_group_destroy(group);
+
+	spin_unlock(&psample_groups_lock);
+}
+EXPORT_SYMBOL_GPL(psample_group_put);
+
+void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
+			   u32 trunc_size, int in_ifindex, int out_ifindex,
+			   u32 sample_rate)
+{
+	struct sk_buff *nl_skb;
+	int data_len;
+	int meta_len;
+	void *data;
+	int ret;
+
+	meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+		   (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+		   nla_total_size(sizeof(u32)) +	/* sample_rate */
+		   nla_total_size(sizeof(u32)) +	/* orig_size */
+		   nla_total_size(sizeof(u32)) +	/* group_num */
+		   nla_total_size(sizeof(u32));		/* seq */
+
+	data_len = min(skb->len, trunc_size);
+	if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
+		data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
+			    - NLA_ALIGNTO;
+
+	nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
+	if (unlikely(!nl_skb))
+		return;
+
+	data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0,
+			   PSAMPLE_CMD_SAMPLE);
+	if (unlikely(!data))
+		goto error;
+
+	if (in_ifindex) {
+		ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	if (out_ifindex) {
+		ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++);
+	if (unlikely(ret < 0))
+		goto error;
+
+	if (data_len) {
+		int nla_len = nla_total_size(data_len);
+		struct nlattr *nla;
+
+		nla = (struct nlattr *)skb_put(nl_skb, nla_len);
+		nla->nla_type = PSAMPLE_ATTR_DATA;
+		nla->nla_len = nla_attr_size(data_len);
+
+		if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
+			goto error;
+	}
+
+	genlmsg_end(nl_skb, data);
+	genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
+				PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
+
+	return;
+error:
+	pr_err_ratelimited("Could not create psample log message\n");
+	nlmsg_free(nl_skb);
+}
+EXPORT_SYMBOL_GPL(psample_sample_packet);
+
+static int __init psample_module_init(void)
+{
+	return genl_register_family(&psample_nl_family);
+}
+
+static void __exit psample_module_exit(void)
+{
+	genl_unregister_family(&psample_nl_family);
+}
+
+module_init(psample_module_init);
+module_exit(psample_module_exit);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("netlink channel for packet sampling");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 5c5670fae43027778e84b9d9ff3b9d91a10a8131 Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Mon, 23 Jan 2017 11:07:09 +0100
Subject: net/sched: Introduce sample tc action

This action allows the user to sample traffic matched by tc classifier.
The sampling consists of choosing packets randomly and sampling them using
the psample module. The user can configure the psample group number, the
sampling rate and the packet's truncation (to save kernel-user traffic).

Example:
To sample ingress traffic from interface eth1, one may use the commands:

tc qdisc add dev eth1 handle ffff: ingress

tc filter add dev eth1 parent ffff: \
	   matchall action sample rate 12 group 4

Where the first command adds an ingress qdisc and the second starts
sampling randomly with an average of one sampled packet per 12 packets on
dev eth1 to psample group 4.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_sample.h        |  50 +++++++
 include/uapi/linux/tc_act/Kbuild      |   1 +
 include/uapi/linux/tc_act/tc_sample.h |  26 ++++
 net/sched/Kconfig                     |  12 ++
 net/sched/Makefile                    |   1 +
 net/sched/act_sample.c                | 274 ++++++++++++++++++++++++++++++++++
 6 files changed, 364 insertions(+)
 create mode 100644 include/net/tc_act/tc_sample.h
 create mode 100644 include/uapi/linux/tc_act/tc_sample.h
 create mode 100644 net/sched/act_sample.c

(limited to 'net')

diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
new file mode 100644
index 000000000000..89e9305be880
--- /dev/null
+++ b/include/net/tc_act/tc_sample.h
@@ -0,0 +1,50 @@
+#ifndef __NET_TC_SAMPLE_H
+#define __NET_TC_SAMPLE_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/psample.h>
+
+struct tcf_sample {
+	struct tc_action common;
+	u32 rate;
+	bool truncate;
+	u32 trunc_size;
+	struct psample_group __rcu *psample_group;
+	u32 psample_group_num;
+	struct list_head tcfm_list;
+	struct rcu_head rcu;
+};
+#define to_sample(a) ((struct tcf_sample *)a)
+
+static inline bool is_tcf_sample(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	return a->ops && a->ops->type == TCA_ACT_SAMPLE;
+#else
+	return false;
+#endif
+}
+
+static inline __u32 tcf_sample_rate(const struct tc_action *a)
+{
+	return to_sample(a)->rate;
+}
+
+static inline bool tcf_sample_truncate(const struct tc_action *a)
+{
+	return to_sample(a)->truncate;
+}
+
+static inline int tcf_sample_trunc_size(const struct tc_action *a)
+{
+	return to_sample(a)->trunc_size;
+}
+
+static inline struct psample_group *
+tcf_sample_psample_group(const struct tc_action *a)
+{
+	return rcu_dereference(to_sample(a)->psample_group);
+}
+
+#endif /* __NET_TC_SAMPLE_H */
diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3db7403296f..ba62ddf0e58a 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -4,6 +4,7 @@ header-y += tc_defact.h
 header-y += tc_gact.h
 header-y += tc_ipt.h
 header-y += tc_mirred.h
+header-y += tc_sample.h
 header-y += tc_nat.h
 header-y += tc_pedit.h
 header-y += tc_skbedit.h
diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h
new file mode 100644
index 000000000000..edc9058bb30d
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_sample.h
@@ -0,0 +1,26 @@
+#ifndef __LINUX_TC_SAMPLE_H
+#define __LINUX_TC_SAMPLE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+
+#define TCA_ACT_SAMPLE 26
+
+struct tc_sample {
+	tc_gen;
+};
+
+enum {
+	TCA_SAMPLE_UNSPEC,
+	TCA_SAMPLE_TM,
+	TCA_SAMPLE_PARMS,
+	TCA_SAMPLE_RATE,
+	TCA_SAMPLE_TRUNC_SIZE,
+	TCA_SAMPLE_PSAMPLE_GROUP,
+	TCA_SAMPLE_PAD,
+	__TCA_SAMPLE_MAX
+};
+#define TCA_SAMPLE_MAX (__TCA_SAMPLE_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a9aa38d43fa7..72cfa3a6bac0 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,18 @@ config NET_ACT_MIRRED
 	  To compile this code as a module, choose M here: the
 	  module will be called act_mirred.
 
+config NET_ACT_SAMPLE
+        tristate "Traffic Sampling"
+        depends on NET_CLS_ACT
+        select PSAMPLE
+        ---help---
+	  Say Y here to allow packet sampling tc action. The packet sample
+	  action consists of statistically choosing packets and sampling
+	  them using the psample module.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_sample.
+
 config NET_ACT_IPT
         tristate "IPtables targets"
         depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda3634e0b..7b915d226de7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT)	+= act_api.o
 obj-$(CONFIG_NET_ACT_POLICE)	+= act_police.o
 obj-$(CONFIG_NET_ACT_GACT)	+= act_gact.o
 obj-$(CONFIG_NET_ACT_MIRRED)	+= act_mirred.o
+obj-$(CONFIG_NET_ACT_SAMPLE)	+= act_sample.o
 obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
 obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000000..39229756de07
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,274 @@
+/*
+ * net/sched/act_sample.c - Packet sampling tc action
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/psample.h>
+
+#include <linux/if_arp.h>
+
+#define SAMPLE_TAB_MASK     7
+static unsigned int sample_net_id;
+static struct tc_action_ops act_sample_ops;
+
+static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
+	[TCA_SAMPLE_PARMS]		= { .len = sizeof(struct tc_sample) },
+	[TCA_SAMPLE_RATE]		= { .type = NLA_U32 },
+	[TCA_SAMPLE_TRUNC_SIZE]		= { .type = NLA_U32 },
+	[TCA_SAMPLE_PSAMPLE_GROUP]	= { .type = NLA_U32 },
+};
+
+static int tcf_sample_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a, int ovr,
+			   int bind)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
+	struct psample_group *psample_group;
+	struct tc_sample *parm;
+	struct tcf_sample *s;
+	bool exists = false;
+	int ret;
+
+	if (!nla)
+		return -EINVAL;
+	ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
+	if (ret < 0)
+		return ret;
+	if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
+	    !tb[TCA_SAMPLE_PSAMPLE_GROUP])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SAMPLE_PARMS]);
+
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_sample_ops, bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+	s = to_sample(*a);
+
+	ASSERT_RTNL();
+	s->tcf_action = parm->action;
+	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+	s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+	psample_group = psample_group_get(net, s->psample_group_num);
+	if (!psample_group)
+		return -ENOMEM;
+	RCU_INIT_POINTER(s->psample_group, psample_group);
+
+	if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
+		s->truncate = true;
+		s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
+	}
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+	return ret;
+}
+
+static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
+{
+	struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
+	struct psample_group *psample_group;
+
+	psample_group = rcu_dereference_protected(s->psample_group, 1);
+	RCU_INIT_POINTER(s->psample_group, NULL);
+	psample_group_put(psample_group);
+}
+
+static void tcf_sample_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_sample *s = to_sample(a);
+
+	call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
+}
+
+static bool tcf_sample_dev_ok_push(struct net_device *dev)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_sample *s = to_sample(a);
+	struct psample_group *psample_group;
+	int retval;
+	int size;
+	int iif;
+	int oif;
+
+	tcf_lastuse_update(&s->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+	retval = READ_ONCE(s->tcf_action);
+
+	rcu_read_lock();
+	psample_group = rcu_dereference(s->psample_group);
+
+	/* randomly sample packets according to rate */
+	if (psample_group && (prandom_u32() % s->rate == 0)) {
+		if (!skb_at_tc_ingress(skb)) {
+			iif = skb->skb_iif;
+			oif = skb->dev->ifindex;
+		} else {
+			iif = skb->dev->ifindex;
+			oif = 0;
+		}
+
+		/* on ingress, the mac header gets popped, so push it back */
+		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+			skb_push(skb, skb->mac_len);
+
+		size = s->truncate ? s->trunc_size : skb->len;
+		psample_sample_packet(psample_group, skb, size, iif, oif,
+				      s->rate);
+
+		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+			skb_pull(skb, skb->mac_len);
+	}
+
+	rcu_read_unlock();
+	return retval;
+}
+
+static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_sample *s = to_sample(a);
+	struct tc_sample opt = {
+		.index      = s->tcf_index,
+		.action     = s->tcf_action,
+		.refcnt     = s->tcf_refcnt - ref,
+		.bindcnt    = s->tcf_bindcnt - bind,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &s->tcf_tm);
+	if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
+		goto nla_put_failure;
+
+	if (s->truncate)
+		if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
+			goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_sample_ops = {
+	.kind	  = "sample",
+	.type	  = TCA_ACT_SAMPLE,
+	.owner	  = THIS_MODULE,
+	.act	  = tcf_sample_act,
+	.dump	  = tcf_sample_dump,
+	.init	  = tcf_sample_init,
+	.cleanup  = tcf_sample_cleanup,
+	.walk	  = tcf_sample_walker,
+	.lookup	  = tcf_sample_search,
+	.size	  = sizeof(struct tcf_sample),
+};
+
+static __net_init int sample_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
+}
+
+static void __net_exit sample_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations sample_net_ops = {
+	.init = sample_init_net,
+	.exit = sample_exit_net,
+	.id   = &sample_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init sample_init_module(void)
+{
+	return tcf_register_action(&act_sample_ops, &sample_net_ops);
+}
+
+static void __exit sample_cleanup_module(void)
+{
+	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
+}
+
+module_init(sample_init_module);
+module_exit(sample_cleanup_module);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Packet sampling action");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 4faf940dd869c36436ff6f0a0b20369fdf5da68b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 24 Jan 2017 01:06:26 +0100
Subject: bpf: simplify __is_valid_access test on cb

The __is_valid_access() test for cb[] from 62c7989b24db ("bpf: allow
b/h/w/dw access for bpf's cb in ctx") was done unnecessarily complex,
we can just simplify it the same way as recent fix from 2d071c643f1c
("bpf, trace: make ctx access checks more robust") did. Overflow can
never happen as size is 1/2/4/8 depending on access.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 90383860e224..883975fa4ed1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2784,19 +2784,8 @@ static bool __is_valid_access(int off, int size)
 	switch (off) {
 	case offsetof(struct __sk_buff, cb[0]) ...
 	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
-		if (size == sizeof(__u16) &&
-		    off > offsetof(struct __sk_buff, cb[4]) + sizeof(__u16))
-			return false;
-		if (size == sizeof(__u32) &&
-		    off > offsetof(struct __sk_buff, cb[4]))
-			return false;
-		if (size == sizeof(__u64) &&
-		    off > offsetof(struct __sk_buff, cb[2]))
-			return false;
-		if (size != sizeof(__u8)  &&
-		    size != sizeof(__u16) &&
-		    size != sizeof(__u32) &&
-		    size != sizeof(__u64))
+		if (off + size >
+		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
 			return false;
 		break;
 	default:
-- 
cgit v1.2.3


From 2492d3b867043f6880708d095a7a5d65debcfc32 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 24 Jan 2017 01:06:27 +0100
Subject: bpf: enable load bytes helper for filter/reuseport progs

BPF_PROG_TYPE_SOCKET_FILTER are used in various facilities such as
for SO_REUSEPORT and packet fanout demuxing, packet filtering, kcm,
etc, and yet the only facility they can use is BPF_LD with {BPF_ABS,
BPF_IND} for single byte/half/word access.

Direct packet access is only restricted to tc programs right now,
but we can still facilitate usage by allowing skb_load_bytes() helper
added back then in 05c74e5e53f6 ("bpf: add bpf_skb_load_bytes helper")
that calls skb_header_pointer() similarly to bpf_load_pointer(), but
for stack buffers with larger access size.

Name the previous sk_filter_func_proto() as bpf_base_func_proto()
since this is used everywhere else as well, similarly for the ctx
converter, that is, bpf_convert_ctx_access().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 883975fa4ed1..e2263da505be 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2598,7 +2598,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 };
 
 static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -2625,6 +2625,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id)
 {
@@ -2680,7 +2691,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2695,7 +2706,7 @@ xdp_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2706,7 +2717,7 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2733,7 +2744,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2983,10 +2994,10 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type,
-					const struct bpf_insn *si,
-					struct bpf_insn *insn_buf,
-					struct bpf_prog *prog)
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+				  const struct bpf_insn *si,
+				  struct bpf_insn *insn_buf,
+				  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
@@ -3210,7 +3221,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct net_device, ifindex));
 		break;
 	default:
-		return sk_filter_convert_ctx_access(type, si, insn_buf, prog);
+		return bpf_convert_ctx_access(type, si, insn_buf, prog);
 	}
 
 	return insn - insn_buf;
@@ -3242,7 +3253,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -3261,24 +3272,24 @@ static const struct bpf_verifier_ops xdp_ops = {
 static const struct bpf_verifier_ops cg_skb_ops = {
 	.get_func_proto		= cg_skb_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops lwt_inout_ops = {
 	.get_func_proto		= lwt_inout_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops lwt_xmit_ops = {
 	.get_func_proto		= lwt_xmit_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
 static const struct bpf_verifier_ops cg_sock_ops = {
-	.get_func_proto		= sk_filter_func_proto,
+	.get_func_proto		= bpf_base_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
-- 
cgit v1.2.3


From d1b662adcdb87944b7f6f7bd2f95cbb1404dbf18 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 24 Jan 2017 01:06:28 +0100
Subject: bpf: allow option for setting bpf_l4_csum_replace from scratch

When programs need to calculate the csum from scratch for small UDP
packets and use bpf_l4_csum_replace() to feed the result from helpers
like bpf_csum_diff(), then we need a flag besides BPF_F_MARK_MANGLED_0
that would ignore the case of current csum being 0, and which would
still allow for the helper to set the csum and transform when needed
to CSUM_MANGLED_0.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd3068485410..e07fd5a324e6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -522,6 +522,7 @@ enum bpf_func_id {
 /* BPF_FUNC_l4_csum_replace flags. */
 #define BPF_F_PSEUDO_HDR		(1ULL << 4)
 #define BPF_F_MARK_MANGLED_0		(1ULL << 5)
+#define BPF_F_MARK_ENFORCE		(1ULL << 6)
 
 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
 #define BPF_F_INGRESS			(1ULL << 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index e2263da505be..1e00737e3bc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1522,10 +1522,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
 {
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
 	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+	bool do_mforce = flags & BPF_F_MARK_ENFORCE;
 	__sum16 *ptr;
 
-	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
-			       BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+			       BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely(offset > 0xffff || offset & 1))
 		return -EFAULT;
@@ -1533,7 +1534,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
 		return -EFAULT;
 
 	ptr = (__sum16 *)(skb->data + offset);
-	if (is_mmzero && !*ptr)
+	if (is_mmzero && !do_mforce && !*ptr)
 		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
-- 
cgit v1.2.3


From 82272db84d874ebe8b4ae24d4e64d770882dbdd1 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Mon, 23 Jan 2017 19:19:07 -0800
Subject: net: dsa: Drop WARN() in tag_brcm.c

We may be able to see invalid Broadcom tags when the hardware and drivers are
misconfigured, or just while exercising the error path. Instead of flooding
the console with messages, flat out drop the packet.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_brcm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index af82927674e0..cb5a2b7a0118 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -121,7 +121,8 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* We should never see a reserved reason code without knowing how to
 	 * handle it
 	 */
-	WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD);
+	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+		goto out_drop;
 
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
-- 
cgit v1.2.3


From 146408693945a68f227175c1cea3772fc0f98f20 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 24 Jan 2017 09:25:54 +0000
Subject: net: sctp: fix array overrun read on sctp_timer_tbl

Table sctp_timer_tbl is missing a TIMEOUT_RECONF string so
add this in. Also compare timeout with the size of the array
sctp_timer_tbl rather than SCTP_EVENT_TIMEOUT_MAX.  Also add
a build time check that SCTP_EVENT_TIMEOUT_MAX is correct
so we don't ever get this kind of mismatch between the table
and SCTP_EVENT_TIMEOUT_MAX in the future.

Kudos to Marcelo Ricardo Leitner for spotting the missing string
and suggesting the build time sanity check.

Fixes CoverityScan CID#1397639 ("Out-of-bounds read")

Fixes: 7b9438de0cd4 ("sctp: add stream reconf timer")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/debug.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 95d7b15dad21..2e47eb2f05cb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = {
 	"TIMEOUT_T4_RTO",
 	"TIMEOUT_T5_SHUTDOWN_GUARD",
 	"TIMEOUT_HEARTBEAT",
+	"TIMEOUT_RECONF",
 	"TIMEOUT_SACK",
 	"TIMEOUT_AUTOCLOSE",
 };
@@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = {
 /* Lookup timer debug name. */
 const char *sctp_tname(const sctp_subtype_t id)
 {
-	if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX)
+	BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl));
+
+	if (id.timeout < ARRAY_SIZE(sctp_timer_tbl))
 		return sctp_timer_tbl[id.timeout];
 	return "unknown_timer";
 }
-- 
cgit v1.2.3


From 1045ba77a5962a22bce7777678ef46714107ea63 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim
Date: Tue, 24 Jan 2017 07:02:41 -0500
Subject: net sched actions: Add support for user cookies

Introduce optional 128-bit action cookie.
Like all other cookie schemes in the networking world (eg in protocols
like http or existing kernel fib protocol field, etc) the idea is to save
user state that when retrieved serves as a correlator. The kernel
_should not_ intepret it.  The user can store whatever they wish in the
128 bits.

Sample exercise(showing variable length use of cookie)

.. create an accept action with cookie a1b2c3d4
sudo $TC actions add action ok index 1 cookie a1b2c3d4

.. dump all gact actions..
sudo $TC -s actions ls action gact

    action order 0: gact action pass
     random type none pass val 0
     index 1 ref 1 bind 0 installed 5 sec used 5 sec
    Action statistics:
    Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
    backlog 0b 0p requeues 0
    cookie a1b2c3d4

.. bind the accept action to a filter..
sudo $TC filter add dev lo parent ffff: protocol ip prio 1 \
u32 match ip dst 127.0.0.1/32 flowid 1:1 action gact index 1

... send some traffic..
$ ping 127.0.0.1 -c 3
PING 127.0.0.1 (127.0.0.1) 56(84) bytes of data.
64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.020 ms
64 bytes from 127.0.0.1: icmp_seq=2 ttl=64 time=0.027 ms
64 bytes from 127.0.0.1: icmp_seq=3 ttl=64 time=0.038 ms

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h        |  1 +
 include/net/pkt_cls.h        |  8 ++++++++
 include/uapi/linux/pkt_cls.h |  3 +++
 net/sched/act_api.c          | 45 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 1d716449209e..cfa2ae33da9a 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -41,6 +41,7 @@ struct tc_action {
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
+	struct tc_cookie	*act_cookie;
 };
 #define tcf_head	common.tcfa_head
 #define tcf_index	common.tcfa_index
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f0a051480c6c..b43077e47d35 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -515,4 +515,12 @@ struct tc_cls_bpf_offload {
 	u32 gen_flags;
 };
 
+
+/* This structure holds cookie structure that is passed from user
+ * to the kernel for actions and classifiers
+ */
+struct tc_cookie {
+	u8  *data;
+	u32 len;
+};
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index fd373ebd5a44..345551e71410 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -4,6 +4,8 @@
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
 
+#define TC_COOKIE_MAX_SIZE 16
+
 /* Action attributes */
 enum {
 	TCA_ACT_UNSPEC,
@@ -12,6 +14,7 @@ enum {
 	TCA_ACT_INDEX,
 	TCA_ACT_STATS,
 	TCA_ACT_PAD,
+	TCA_ACT_COOKIE,
 	__TCA_ACT_MAX
 };
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cd08df91351d..3c5e29ba6594 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -24,6 +24,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/sch_generic.h>
+#include <net/pkt_cls.h>
 #include <net/act_api.h>
 #include <net/netlink.h>
 
@@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head)
 
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_qstats);
+
+	if (p->act_cookie) {
+		kfree(p->act_cookie->data);
+		kfree(p->act_cookie);
+	}
+
 	kfree(p);
 }
 
@@ -475,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 		goto nla_put_failure;
 	if (tcf_action_copy_stats(skb, a, 0))
 		goto nla_put_failure;
+	if (a->act_cookie) {
+		if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
+			    a->act_cookie->data))
+			goto nla_put_failure;
+	}
+
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
@@ -516,6 +529,22 @@ errout:
 	return err;
 }
 
+int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb)
+{
+	a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL);
+	if (!a->act_cookie)
+		return -ENOMEM;
+
+	a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
+	if (!a->act_cookie->data) {
+		kfree(a->act_cookie);
+		return -ENOMEM;
+	}
+	a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]);
+
+	return 0;
+}
+
 struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 				    struct nlattr *est, char *name, int ovr,
 				    int bind)
@@ -575,6 +604,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		goto err_mod;
 
+	if (tb[TCA_ACT_COOKIE]) {
+		int cklen = nla_len(tb[TCA_ACT_COOKIE]);
+
+		if (cklen > TC_COOKIE_MAX_SIZE) {
+			err = -EINVAL;
+			tcf_hash_release(a, bind);
+			goto err_mod;
+		}
+
+		err = nla_memdup_cookie(a, tb);
+		if (err < 0) {
+			tcf_hash_release(a, bind);
+			goto err_mod;
+		}
+	}
+
 	/* module count goes up only when brand new policy is created
 	 * if it exists and is only bound to in a_o->init() then
 	 * ACT_P_CREATED is not returned (a zero is).
-- 
cgit v1.2.3


From a08ef4768f479a712f80426fba42d16253feb37c Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Tue, 24 Jan 2017 12:49:35 +0300
Subject: tipc: uninitialized return code in tipc_setsockopt()

We shuffled some code around and added some new case statements here and
now "res" isn't initialized on all paths.

Fixes: 01fd12bb189a ("tipc: make replicast a user selectable option")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5bec8aac5008..103d1fd058c0 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2348,7 +2348,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	u32 value = 0;
-	int res;
+	int res = 0;
 
 	if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
 		return 0;
@@ -2388,7 +2388,6 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 		break;
 	case TIPC_CONN_TIMEOUT:
 		tipc_sk(sk)->conn_timeout = value;
-		/* no need to set "res", since already 0 at this point */
 		break;
 	case TIPC_MCAST_BROADCAST:
 		tsk->mc_method.rcast = false;
-- 
cgit v1.2.3


From 60b1af3300724d211bb0b420c1fbe6bf5b87b013 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 24 Jan 2017 14:57:36 -0800
Subject: tcp: reduce skb overhead in selected places

tcp_add_backlog() can use skb_condense() helper to get better
gains and less SKB_TRUESIZE() magic. This only happens when socket
backlog has to be used.

Some attacks involve specially crafted out of order tiny TCP packets,
clogging the ofo queue of (many) sockets.
Then later, expensive collapse happens, trying to copy all these skbs
into single ones.
This unfortunately does not work if each skb has no neighbor in TCP
sequence order.

By using skb_condense() if the skb could not be coalesced to a prior
one, we defeat these kind of threats, potentially saving 4K per skb
(or more, since this is one page fragment).

A typical NAPI driver allocates gro packets with GRO_MAX_HEAD bytes
in skb->head, meaning the copy done by skb_condense() is limited to
about 200 bytes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 net/ipv4/tcp_ipv4.c  | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bfa165cc455a..3de6eba378ad 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4507,6 +4507,7 @@ add_sack:
 end:
 	if (skb) {
 		tcp_grow_window(sk, skb);
+		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f7325b25b06e..a90b4540c11e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1556,8 +1556,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 	 * It has been noticed pure SACK packets were sometimes dropped
 	 * (if cooked by drivers without copybreak feature).
 	 */
-	if (!skb->data_len)
-		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+	skb_condense(skb);
 
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
-- 
cgit v1.2.3


From 065263f40f0972d5f1cd294bb0242bd5aa5f06b2 Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Mon, 23 Jan 2017 10:59:20 -0800
Subject: net/tcp-fastopen: refactor cookie check logic

Refactor the cookie check logic in tcp_send_syn_data() into a function.
This function will be called else where in later changes.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  2 ++
 net/ipv4/tcp_fastopen.c | 21 +++++++++++++++++++++
 net/ipv4/tcp_output.c   | 16 ++--------------
 3 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c55d65f74f7f..de67541d7adf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1493,6 +1493,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct tcp_fastopen_cookie *foc,
 			      struct dst_entry *dst);
 void tcp_fastopen_init_key_once(bool publish);
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			     struct tcp_fastopen_cookie *cookie);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f51919535ca7..f90e09e1ff4c 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -325,3 +325,24 @@ fastopen:
 	*foc = valid_foc;
 	return NULL;
 }
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie)
+{
+	unsigned long last_syn_loss = 0;
+	int syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+	/* Recurring FO SYN losses: no cookie or data in SYN */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		cookie->len = -1;
+		return false;
+	}
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+		cookie->len = -1;
+		return true;
+	}
+	return cookie->len > 0;
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a1a1494b9dd..671c69535671 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
-	unsigned long last_syn_loss = 0;
+	int space, err = 0;
 	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
-			       &syn_loss, &last_syn_loss);
-	/* Recurring FO SYN losses: revert to regular handshake temporarily */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		fo->cookie.len = -1;
-		goto fallback;
-	}
-
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
-		fo->cookie.len = -1;
-	else if (fo->cookie.len <= 0)
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
-- 
cgit v1.2.3


From 25776aa943401662617437841b3d3ea4693ee98a Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Mon, 23 Jan 2017 10:59:21 -0800
Subject: net: Remove __sk_dst_reset() in tcp_v6_connect()

Remove __sk_dst_reset() in the failure handling because __sk_dst_reset()
will eventually get called when sk is released. No need to handle it in
the protocol specific connect call.
This is also to make the code path consistent with ipv4.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f72100eedd5d..0b7cd3d009b6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -295,7 +295,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 late_failure:
 	tcp_set_state(sk, TCP_CLOSE);
-	__sk_dst_reset(sk);
 failure:
 	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
-- 
cgit v1.2.3


From 19f6d3f3c8422d65b5e3d2162e30ef07c6e21ea2 Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Mon, 23 Jan 2017 10:59:22 -0800
Subject: net/tcp-fastopen: Add new API support

This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
  s = socket()
    create a new socket
  setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
    newly introduced sockopt
    If set, new functionality described below will be used.
    Return ENOTSUPP if TFO is not supported or not enabled in the
    kernel.

  connect()
    With cookie present, return 0 immediately.
    With no cookie, initiate 3WHS with TFO cookie-request option and
    return -1 with errno = EINPROGRESS.

  write()/sendmsg()
    With cookie present, send out SYN with data and return the number of
    bytes buffered.
    With no cookie, and 3WHS not yet completed, return -1 with errno =
    EINPROGRESS.
    No MSG_FASTOPEN flag is needed.

  read()
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
    write() is not called yet.
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
    established but no msg is received yet.
    Return number of bytes read if socket is established and there is
    msg received.

The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  3 ++-
 include/net/inet_sock.h  |  6 +++++-
 include/net/tcp.h        |  1 +
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/af_inet.c       | 31 ++++++++++++++++++++++++-------
 net/ipv4/tcp.c           | 35 ++++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_fastopen.c  | 33 +++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  7 ++++++-
 net/ipv6/tcp_ipv6.c      |  5 +++++
 9 files changed, 111 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 5371b3d70cfe..f88f4649ba6f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -222,7 +222,8 @@ struct tcp_sock {
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:5;
+		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
+		unused:4;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index c9cff977a7fb..aa95053dfc78 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -206,7 +206,11 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
-	__u8			bind_address_no_port:1;
+	__u8			bind_address_no_port:1,
+				defer_connect:1; /* Indicates that fastopen_connect is set
+						  * and cookie exists so we defer connect
+						  * until first data frame is written
+						  */
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de67541d7adf..6ec4ea652f3f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1495,6 +1495,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c53de2691cec..6ff35eb48d10 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -116,6 +116,7 @@ enum {
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 28fe8da4e1ac..92e7f3e957fa 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
-	if (addr_len < sizeof(uaddr->sa_family))
-		return -EINVAL;
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;
 
-	if (uaddr->sa_family == AF_UNSPEC) {
-		err = sk->sk_prot->disconnect(sk, flags);
-		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
-		goto out;
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
 	}
 
 	switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		err = -EALREADY;
+		if (inet_sk(sk)->defer_connect)
+			err = -EINPROGRESS;
+		else
+			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
 		break;
 	case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 		sock->state = SS_CONNECTING;
 
+		if (!err && inet_sk(sk)->defer_connect)
+			goto out;
+
 		/* Just entered SS_CONNECTING state; the only
 		 * difference is that return value in non-blocking
 		 * case is EINPROGRESS, rather than EALREADY.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c43eb1a831d7..d9735b76d073 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
+	} else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return POLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= POLLOUT | POLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
@@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	int err, flags;
 
 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;
 
+	if (inet->defer_connect) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
 				    msg->msg_namelen, flags);
+	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
@@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
-	if (flags & MSG_FASTOPEN) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
@@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		}
 		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;
 
+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f90e09e1ff4c..9674bec4a0f8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -346,3 +346,36 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 	}
 	return cookie->len > 0;
 }
+
+/* This function checks if we want to defer sending SYN until the first
+ * write().  We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ *               return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (tp->fastopen_connect && !tp->fastopen_req) {
+		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+			inet_sk(sk)->defer_connect = 1;
+			return true;
+		}
+
+		/* Alloc fastopen_req in order for FO option to be included
+		 * in SYN
+		 */
+		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+					   sk->sk_allocation);
+		if (tp->fastopen_req)
+			tp->fastopen_req->cookie = cookie;
+		else
+			*err = -ENOBUFS;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a90b4540c11e..8c9e9aa17d66 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;
 
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
 	err = tcp_connect(sk);
 
-	rt = NULL;
 	if (err)
 		goto failure;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0b7cd3d009b6..95c05e5293b1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 							     inet->inet_dport,
 							     &tp->tsoffset);
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto late_failure;
+
 	err = tcp_connect(sk);
 	if (err)
 		goto late_failure;
-- 
cgit v1.2.3


From 3979ad7e82dfe3fb94a51c3915e64ec64afa45c3 Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Wed, 25 Jan 2017 14:42:46 +0100
Subject: net/tcp-fastopen: make connect()'s return case more consistent with
 non-TFO

Without TFO, any subsequent connect() call after a successful one returns
-1 EISCONN. The last API update ensured that __inet_stream_connect() can
return -1 EINPROGRESS in response to sendmsg() when TFO is in use to
indicate that the connection is now in progress. Unfortunately since this
function is used both for connect() and sendmsg(), it has the undesired
side effect of making connect() now return -1 EINPROGRESS as well after
a successful call, while at the same time poll() returns POLLOUT. This
can confuse some applications which happen to call connect() and to
check for -1 EISCONN to ensure the connection is usable, and for which
EINPROGRESS indicates a need to poll, causing a loop.

This problem was encountered in haproxy where a call to connect() is
precisely used in certain cases to confirm a connection's readiness.
While arguably haproxy's behaviour should be improved here, it seems
important to aim at a more robust behaviour when the goal of the new
API is to make it easier to implement TFO in existing applications.

This patch simply ensures that we preserve the same semantics as in
the non-TFO case on the connect() syscall when using TFO, while still
returning -1 EINPROGRESS on sendmsg(). For this we simply tell
__inet_stream_connect() whether we're doing a regular connect() or in
fact connecting for a sendmsg() call.

Cc: Wei Wang <weiwan@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_common.h | 2 +-
 net/ipv4/af_inet.c        | 6 +++---
 net/ipv4/tcp.c            | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 5d683428fced..b7952d55b9c0 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -17,7 +17,7 @@ int inet_release(struct socket *sock);
 int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			int addr_len, int flags);
 int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			  int addr_len, int flags);
+			  int addr_len, int flags, int is_sendmsg);
 int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags);
 int inet_accept(struct socket *sock, struct socket *newsock, int flags);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 92e7f3e957fa..685ba53df2d1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -570,7 +570,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
  *	TCP 'magic' in here.
  */
 int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			  int addr_len, int flags)
+			  int addr_len, int flags, int is_sendmsg)
 {
 	struct sock *sk = sock->sk;
 	int err;
@@ -605,7 +605,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	case SS_CONNECTING:
 		if (inet_sk(sk)->defer_connect)
-			err = -EINPROGRESS;
+			err = is_sendmsg ? -EINPROGRESS : -EISCONN;
 		else
 			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
@@ -679,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 
 	lock_sock(sock->sk);
-	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+	err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
 	release_sock(sock->sk);
 	return err;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d9735b76d073..2ed472ebf3b5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1103,7 +1103,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
-				    msg->msg_namelen, flags);
+				    msg->msg_namelen, flags, 1);
 	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
-- 
cgit v1.2.3


From 5b9d6b154a622d0481123c1b149afdf377264268 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Wed, 25 Jan 2017 23:29:33 +0100
Subject: bridge: move maybe_deliver_addr() inside #ifdef

The only caller of this new function is inside of an #ifdef checking
for CONFIG_BRIDGE_IGMP_SNOOPING, so we should move the implementation
there too, in order to avoid this harmless warning:

net/bridge/br_forward.c:177:13: error: 'maybe_deliver_addr' defined but not used [-Werror=unused-function]

Fixes: 6db6f0eae605 ("bridge: multicast to unicast")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c | 50 ++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index a0f9d0037d24..5a1f8ef49899 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -174,31 +174,6 @@ out:
 	return p;
 }
 
-static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
-			       const unsigned char *addr, bool local_orig)
-{
-	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
-	const unsigned char *src = eth_hdr(skb)->h_source;
-
-	if (!should_deliver(p, skb))
-		return;
-
-	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
-	if (skb->dev == p->dev && ether_addr_equal(src, addr))
-		return;
-
-	skb = skb_copy(skb, GFP_ATOMIC);
-	if (!skb) {
-		dev->stats.tx_dropped++;
-		return;
-	}
-
-	if (!is_broadcast_ether_addr(addr))
-		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
-
-	__br_forward(p, skb, local_orig);
-}
-
 /* called under rcu_read_lock */
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
@@ -245,6 +220,31 @@ out:
 }
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+			       const unsigned char *addr, bool local_orig)
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+	const unsigned char *src = eth_hdr(skb)->h_source;
+
+	if (!should_deliver(p, skb))
+		return;
+
+	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+	if (skb->dev == p->dev && ether_addr_equal(src, addr))
+		return;
+
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb) {
+		dev->stats.tx_dropped++;
+		return;
+	}
+
+	if (!is_broadcast_ether_addr(addr))
+		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+
+	__br_forward(p, skb, local_orig);
+}
+
 /* called with rcu_read_lock */
 void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 			struct sk_buff *skb,
-- 
cgit v1.2.3


From d3e9768ab97fbbff43d9face2711692206a576a9 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich
Date: Thu, 24 Nov 2016 16:11:01 +0100
Subject: batman-adv: don't add loop detect macs to TT

The bridge loop avoidance (BLA) feature of batman-adv sends packets to
probe for Mesh/LAN packet loops. Those packets are not sent by real
clients and should therefore not be added to the translation table (TT).

Signed-off-by: Simon Wunderlich <simon.wunderlich@open-mesh.com>
---
 net/batman-adv/bridge_loop_avoidance.h | 18 ++++++++++++++++++
 net/batman-adv/soft-interface.c        |  3 ++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 1ae93e46fb98..2827cd3c13d2 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -20,6 +20,8 @@
 
 #include "main.h"
 
+#include <linux/compiler.h>
+#include <linux/stddef.h>
 #include <linux/types.h>
 
 struct net_device;
@@ -27,6 +29,22 @@ struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
+/**
+ * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
+ *  frame sent by bridge loop avoidance
+ * @mac: mac address to check
+ *
+ * Return: true if the it looks like a loop detect frame
+ * (mac starts with BA:BE), false otherwise
+ */
+static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
+{
+	if (mac[0] == 0xba && mac[1] == 0xbe)
+		return true;
+
+	return false;
+}
+
 #ifdef CONFIG_BATMAN_ADV_BLA
 bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
 		   unsigned short vid, bool is_bcast);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7b3494ae6ad9..1f55b4b9181c 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -258,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
 	ethhdr = eth_hdr(skb);
 
 	/* Register the client MAC in the transtable */
-	if (!is_multicast_ether_addr(ethhdr->h_source)) {
+	if (!is_multicast_ether_addr(ethhdr->h_source) &&
+	    !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
 		client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
 						   vid, skb->skb_iif,
 						   skb->mark);
-- 
cgit v1.2.3


From ac79cbb96b58614ce13c4fccc00a9b4d43c2f79b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann
Date: Sun, 1 Jan 2017 00:00:00 +0100
Subject: batman-adv: update copyright years for 2017

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h        | 2 +-
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.c              | 2 +-
 net/batman-adv/bat_algo.h              | 2 +-
 net/batman-adv/bat_iv_ogm.c            | 2 +-
 net/batman-adv/bat_iv_ogm.h            | 2 +-
 net/batman-adv/bat_v.c                 | 2 +-
 net/batman-adv/bat_v.h                 | 2 +-
 net/batman-adv/bat_v_elp.c             | 2 +-
 net/batman-adv/bat_v_elp.h             | 2 +-
 net/batman-adv/bat_v_ogm.c             | 2 +-
 net/batman-adv/bat_v_ogm.h             | 2 +-
 net/batman-adv/bitarray.c              | 2 +-
 net/batman-adv/bitarray.h              | 2 +-
 net/batman-adv/bridge_loop_avoidance.c | 2 +-
 net/batman-adv/bridge_loop_avoidance.h | 2 +-
 net/batman-adv/debugfs.c               | 2 +-
 net/batman-adv/debugfs.h               | 2 +-
 net/batman-adv/distributed-arp-table.c | 2 +-
 net/batman-adv/distributed-arp-table.h | 2 +-
 net/batman-adv/fragmentation.c         | 2 +-
 net/batman-adv/fragmentation.h         | 2 +-
 net/batman-adv/gateway_client.c        | 2 +-
 net/batman-adv/gateway_client.h        | 2 +-
 net/batman-adv/gateway_common.c        | 2 +-
 net/batman-adv/gateway_common.h        | 2 +-
 net/batman-adv/hard-interface.c        | 2 +-
 net/batman-adv/hard-interface.h        | 2 +-
 net/batman-adv/hash.c                  | 2 +-
 net/batman-adv/hash.h                  | 2 +-
 net/batman-adv/icmp_socket.c           | 2 +-
 net/batman-adv/icmp_socket.h           | 2 +-
 net/batman-adv/log.c                   | 2 +-
 net/batman-adv/log.h                   | 2 +-
 net/batman-adv/main.c                  | 2 +-
 net/batman-adv/main.h                  | 2 +-
 net/batman-adv/multicast.c             | 2 +-
 net/batman-adv/multicast.h             | 2 +-
 net/batman-adv/netlink.c               | 2 +-
 net/batman-adv/netlink.h               | 2 +-
 net/batman-adv/network-coding.c        | 2 +-
 net/batman-adv/network-coding.h        | 2 +-
 net/batman-adv/originator.c            | 2 +-
 net/batman-adv/originator.h            | 2 +-
 net/batman-adv/packet.h                | 2 +-
 net/batman-adv/routing.c               | 2 +-
 net/batman-adv/routing.h               | 2 +-
 net/batman-adv/send.c                  | 2 +-
 net/batman-adv/send.h                  | 2 +-
 net/batman-adv/soft-interface.c        | 2 +-
 net/batman-adv/soft-interface.h        | 2 +-
 net/batman-adv/sysfs.c                 | 2 +-
 net/batman-adv/sysfs.h                 | 2 +-
 net/batman-adv/tp_meter.c              | 2 +-
 net/batman-adv/tp_meter.h              | 2 +-
 net/batman-adv/translation-table.c     | 2 +-
 net/batman-adv/translation-table.h     | 2 +-
 net/batman-adv/tvlv.c                  | 2 +-
 net/batman-adv/tvlv.h                  | 2 +-
 net/batman-adv/types.h                 | 2 +-
 60 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 734fe83ab645..a83ddb7b63db 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index f724d3c98a81..915987bc6d29 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 623d04302aa2..44fd073b7546 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 3b5b69cdd12b..29f6312f9bf1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f00f666e2ccd..7c3d994e90d8 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index b9f3550faaf7..ae2ab526bdb1 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 2ac612d7bab4..0acd081dd286 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 83b77639729e..dd7c4b647e6b 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index f2fb2f05b6bf..b90c9903e246 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index be17c0b1369e..376ead280ab9 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 38b9aab83fc0..03a35c9f456d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 4c4d45caa422..2068770b542d 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 032271421a20..2b070c7e31da 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0e6e9d09078c..cc262c9d97e0 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..2d22fd5ba96c 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 2827cd3c13d2..e157986bd01c 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 77925504379d..5406148b9497 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index e49121ee55f6..9c5d4a65b98c 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 49576c5a3fe3..dab466f97ccf 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 813ecea96cf9..ec364a3c1c66 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 9c561e683f4b..42bfbd801a1b 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index b95f619606af..1a2d6c308745 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 52b8bd6ec431..de9955d5224d 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 859166d03561..3baa3d466e5e 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 21184810d89f..5db2e43e3775 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 8a5e1ddf1175..0a6a97d201f2 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 61a431a9772b..e348f76ea8c1 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d6309a423629..9f9890ff7a22 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a0a0fdb85805..b5f7e13918ac 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 557a7044cfbc..0c905e91c5e2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index b310f381ae02..6308c9f0fd96 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e44a7da51431..f3fec40aae86 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index c73c31769aba..4ef4bde2cc2d 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 3284a7b0325d..7a2b9f4da078 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d46415edd3be..5000c540614d 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 8683542067ba..57a8103dbce7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 090a69fc342e..952ba81a565b 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 2cddaf52a21d..2a78cddab0e9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 062738163bdc..ab13b4d58733 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 52eb16281aba..f1cd8c5da966 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index ab5a3bf0765f..e1f6fc72fe3e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index d6d7fb4ec5d5..c66efb81d2f4 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 8f3b2969cc4e..8e2a4b205257 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index ebc56183f358..d94220a6d21a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 7a36bcfa0ba0..8e8a5db197cb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 6713bdf414cd..5f050fbdfff7 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 05c3ff42e181..5ede16c32f15 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 49021b7124f3..d7308263b8fa 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index a94e1e8639ca..f21166d10323 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 1f55b4b9181c..4a9923a95e8a 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index ec303ddbf647..639c3abb214a 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 17c844196eb2..0ae8b30e4eaa 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c76021b4e198..e487412e256b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 981e8c5b07e9..07f64b60b528 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index ba922c425e56..a8ada5c123bd 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 30ecbfb40adf..941afad92121 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 783fdba84db2..411d586191da 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index a783420356ae..1d9e267caec9 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e4369b547b43..4d01400ada30 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index e913aee28c98..8f64a5c01345 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
-- 
cgit v1.2.3


From 269cee621847a09b9cfcdb6469ff9b546efbcac1 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann
Date: Sat, 17 Dec 2016 11:58:29 +0100
Subject: batman-adv: Remove unused variable in batadv_tt_local_set_flags

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/translation-table.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 941afad92121..6077a87d46f0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -3714,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
 {
 	struct batadv_hashtable *hash = bat_priv->tt.local_hash;
 	struct batadv_tt_common_entry *tt_common_entry;
-	u16 changed_num = 0;
 	struct hlist_head *head;
 	u32 i;
 
@@ -3736,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
 					continue;
 				tt_common_entry->flags &= ~flags;
 			}
-			changed_num++;
 
 			if (!count)
 				continue;
-- 
cgit v1.2.3


From 0843f197c46dd3c432a3202de80e8bc1fcb96a6b Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Mon, 21 Nov 2016 23:01:09 +0800
Subject: batman-adv: Remove one condition check in batadv_route_unicast_packet

It could decrease one condition check to collect some statements in the
first condition block.

Signed-off-by: Gao Feng <gfree.wind@gmail.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/routing.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 5f050fbdfff7..7fd740b6e36d 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -719,20 +719,19 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 
 	len = skb->len;
 	res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
-	if (res == NET_XMIT_SUCCESS)
-		ret = NET_RX_SUCCESS;
-
-	/* skb was consumed */
-	skb = NULL;
 
 	/* translate transmit result into receive result */
 	if (res == NET_XMIT_SUCCESS) {
+		ret = NET_RX_SUCCESS;
 		/* skb was transmitted and consumed */
 		batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
 		batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
 				   len + ETH_HLEN);
 	}
 
+	/* skb was consumed */
+	skb = NULL;
+
 put_orig_node:
 	batadv_orig_node_put(orig_node);
 free_skb:
-- 
cgit v1.2.3


From c33705188c493b7de3b8dc2956d67de91b444727 Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Mon, 21 Nov 2016 23:00:32 +0800
Subject: batman-adv: Treat NET_XMIT_CN as transmit successfully

The tc could return NET_XMIT_CN as one congestion notification, but
it does not mean the packet is lost. Other modules like ipvlan,
macvlan, and others treat NET_XMIT_CN as success too.

So batman-adv should handle NET_XMIT_CN also as NET_XMIT_SUCCESS.

Signed-off-by: Gao Feng <gfree.wind@gmail.com>
[sven@narfation.org: Moved NET_XMIT_CN handling to batadv_send_skb_packet]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d7308263b8fa..d9b2889064a6 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -115,7 +115,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 	 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
 	 * (which is > 0). This will not be treated as an error.
 	 */
-	return dev_queue_xmit(skb);
+	return net_xmit_eval(dev_queue_xmit(skb));
 send_skb_err:
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
-- 
cgit v1.2.3


From 731977e97b3697454a862fec656c2561eabc0b87 Mon Sep 17 00:00:00 2001
From: Amadeusz Sławiński
Date: Tue, 24 Jan 2017 16:42:10 +0100
Subject: mac80211: use helper function to access ieee802_1d_to_ac[]

cleanup patch to make use of ieee80211_ac_from_tid() to retrieve ac from
ieee802_1d_to_ac[]

Signed-off-by: Amadeusz Sławiński <amadeusz.slawinski@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c     | 2 +-
 net/mac80211/status.c | 2 +-
 net/mac80211/tx.c     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b791c4190564..50ca3828b124 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1391,7 +1391,7 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll);
 void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-	u8 ac = ieee802_1d_to_ac[tid & 7];
+	int ac = ieee80211_ac_from_tid(tid);
 
 	/*
 	 * If this AC is not trigger-enabled do nothing unless the
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index d6a1bfaa7a81..a3af6e1bfd98 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -95,7 +95,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 		 */
 		if (*p & IEEE80211_QOS_CTL_EOSP)
 			*p &= ~IEEE80211_QOS_CTL_EOSP;
-		ac = ieee802_1d_to_ac[tid & 7];
+		ac = ieee80211_ac_from_tid(tid);
 	} else {
 		ac = IEEE80211_AC_BE;
 	}
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 986de098803d..ba8d7db0a071 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1414,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 		txqi->txq.sta = &sta->sta;
 		sta->sta.txq[tid] = &txqi->txq;
 		txqi->txq.tid = tid;
-		txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
+		txqi->txq.ac = ieee80211_ac_from_tid(tid);
 	} else {
 		sdata->vif.txq = &txqi->txq;
 		txqi->txq.tid = 0;
@@ -4659,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb, int tid,
 				 enum nl80211_band band)
 {
-	int ac = ieee802_1d_to_ac[tid & 7];
+	int ac = ieee80211_ac_from_tid(tid);
 
 	skb_reset_mac_header(skb);
 	skb_set_queue_mapping(skb, ac);
-- 
cgit v1.2.3


From 55ed0ce0898e15fec30d2ca2a563d7934b082375 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 26 Jan 2017 10:45:51 -0800
Subject: net: dsa: Pass device pointer to dsa_register_switch

In preparation for allowing dsa_register_switch() to be supplied with
device/platform data, pass down a struct device pointer instead of a
struct device_node.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 2 +-
 drivers/net/dsa/mv88e6xxx/chip.c | 7 +++----
 drivers/net/dsa/qca8k.c          | 2 +-
 include/net/dsa.h                | 2 +-
 net/dsa/dsa2.c                   | 7 ++++---
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 5cbb14f6a03b..bb210b12ad1b 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1894,7 +1894,7 @@ int b53_switch_register(struct b53_device *dev)
 
 	pr_info("found switch: %s, rev %i\n", dev->name, dev->core_rev);
 
-	return dsa_register_switch(dev->ds, dev->ds->dev->of_node);
+	return dsa_register_switch(dev->ds, dev->ds->dev);
 }
 EXPORT_SYMBOL(b53_switch_register);
 
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 5668e778ed1d..921e53351786 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4356,8 +4356,7 @@ static struct dsa_switch_driver mv88e6xxx_switch_drv = {
 	.ops			= &mv88e6xxx_switch_ops,
 };
 
-static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip,
-				     struct device_node *np)
+static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip)
 {
 	struct device *dev = chip->dev;
 	struct dsa_switch *ds;
@@ -4372,7 +4371,7 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip,
 
 	dev_set_drvdata(dev, ds);
 
-	return dsa_register_switch(ds, np);
+	return dsa_register_switch(ds, dev);
 }
 
 static void mv88e6xxx_unregister_switch(struct mv88e6xxx_chip *chip)
@@ -4456,7 +4455,7 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 	if (err)
 		goto out_g2_irq;
 
-	err = mv88e6xxx_register_switch(chip, np);
+	err = mv88e6xxx_register_switch(chip);
 	if (err)
 		goto out_mdio;
 
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 54d270d59eb0..c084aa484d2b 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -964,7 +964,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
 	mutex_init(&priv->reg_mutex);
 	dev_set_drvdata(&mdiodev->dev, priv);
 
-	return dsa_register_switch(priv->ds, priv->ds->dev->of_node);
+	return dsa_register_switch(priv->ds, &mdiodev->dev);
 }
 
 static void
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 08b340403927..92fd795e9573 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -387,7 +387,7 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
 }
 
 void dsa_unregister_switch(struct dsa_switch *ds);
-int dsa_register_switch(struct dsa_switch *ds, struct device_node *np);
+int dsa_register_switch(struct dsa_switch *ds, struct device *dev);
 #ifdef CONFIG_PM_SLEEP
 int dsa_switch_suspend(struct dsa_switch *ds);
 int dsa_switch_resume(struct dsa_switch *ds);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 866222a8f9bf..2cf489c5e90f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -578,8 +578,9 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 	return ports;
 }
 
-static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
+	struct device_node *np = dev->of_node;
 	struct device_node *ports = dsa_get_ports(ds, np);
 	struct dsa_switch_tree *dst;
 	u32 tree, index;
@@ -659,12 +660,12 @@ out:
 	return err;
 }
 
-int dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
 	int err;
 
 	mutex_lock(&dsa2_mutex);
-	err = _dsa_register_switch(ds, np);
+	err = _dsa_register_switch(ds, dev);
 	mutex_unlock(&dsa2_mutex);
 
 	return err;
-- 
cgit v1.2.3


From 293784a8f856e854b4742be4aacf435062d91e9c Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 26 Jan 2017 10:45:52 -0800
Subject: net: dsa: Make most functions take a dsa_port argument

In preparation for allowing platform data, and therefore no valid
device_node pointer, make most DSA functions takes a pointer to a
dsa_port structure whenever possible. While at it, introduce a
dsa_port_is_valid() helper function which checks whether port->dn is
NULL or not at the moment.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c      | 15 ++++++++------
 net/dsa/dsa2.c     | 61 +++++++++++++++++++++++++++++-------------------------
 net/dsa/dsa_priv.h |  4 ++--
 3 files changed, 44 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 1f3afeb673d6..07e863369e04 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -110,8 +110,9 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 
 /* basic switch operations **************************************************/
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
-		      struct device_node *port_dn, int port)
+		      struct dsa_port *dport, int port)
 {
+	struct device_node *port_dn = dport->dn;
 	struct phy_device *phydev;
 	int ret, mode;
 
@@ -141,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 
 static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
 {
-	struct device_node *port_dn;
+	struct dsa_port *dport;
 	int ret, port;
 
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 
-		port_dn = ds->ports[port].dn;
-		ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port);
+		dport = &ds->ports[port];
+		ret = dsa_cpu_dsa_setup(ds, dev, dport, port);
 		if (ret)
 			return ret;
 	}
@@ -364,8 +365,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	return ds;
 }
 
-void dsa_cpu_dsa_destroy(struct device_node *port_dn)
+void dsa_cpu_dsa_destroy(struct dsa_port *port)
 {
+	struct device_node *port_dn = port->dn;
+
 	if (of_phy_is_fixed_link(port_dn))
 		of_phy_deregister_fixed_link(port_dn);
 }
@@ -389,7 +392,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
-		dsa_cpu_dsa_destroy(ds->ports[port].dn);
+		dsa_cpu_dsa_destroy(&ds->ports[port]);
 
 		/* Clearing a bit which is not set does no harm */
 		ds->cpu_port_mask |= ~(1 << port);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 2cf489c5e90f..56c43ca7c049 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -78,14 +78,19 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
 	kref_put(&dst->refcount, dsa_free_dst);
 }
 
-static bool dsa_port_is_dsa(struct device_node *port)
+static bool dsa_port_is_valid(struct dsa_port *port)
 {
-	return !!of_parse_phandle(port, "link", 0);
+	return !!port->dn;
 }
 
-static bool dsa_port_is_cpu(struct device_node *port)
+static bool dsa_port_is_dsa(struct dsa_port *port)
 {
-	return !!of_parse_phandle(port, "ethernet", 0);
+	return !!of_parse_phandle(port->dn, "link", 0);
+}
+
+static bool dsa_port_is_cpu(struct dsa_port *port)
+{
+	return !!of_parse_phandle(port->dn, "ethernet", 0);
 }
 
 static bool dsa_ds_find_port(struct dsa_switch *ds,
@@ -119,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
 
 static int dsa_port_complete(struct dsa_switch_tree *dst,
 			     struct dsa_switch *src_ds,
-			     struct device_node *port,
+			     struct dsa_port *port,
 			     u32 src_port)
 {
 	struct device_node *link;
@@ -127,7 +132,7 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
 	struct dsa_switch *dst_ds;
 
 	for (index = 0;; index++) {
-		link = of_parse_phandle(port, "link", index);
+		link = of_parse_phandle(port->dn, "link", index);
 		if (!link)
 			break;
 
@@ -150,13 +155,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
  */
 static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
 	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (!dsa_port_is_dsa(port))
@@ -196,7 +201,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst)
 	return 0;
 }
 
-static int dsa_dsa_port_apply(struct device_node *port, u32 index,
+static int dsa_dsa_port_apply(struct dsa_port *port, u32 index,
 			      struct dsa_switch *ds)
 {
 	int err;
@@ -211,13 +216,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index,
 	return 0;
 }
 
-static void dsa_dsa_port_unapply(struct device_node *port, u32 index,
+static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index,
 				 struct dsa_switch *ds)
 {
 	dsa_cpu_dsa_destroy(port);
 }
 
-static int dsa_cpu_port_apply(struct device_node *port, u32 index,
+static int dsa_cpu_port_apply(struct dsa_port *port, u32 index,
 			      struct dsa_switch *ds)
 {
 	int err;
@@ -234,7 +239,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index,
 	return 0;
 }
 
-static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
+static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
 				 struct dsa_switch *ds)
 {
 	dsa_cpu_dsa_destroy(port);
@@ -242,13 +247,13 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
 
 }
 
-static int dsa_user_port_apply(struct device_node *port, u32 index,
+static int dsa_user_port_apply(struct dsa_port *port, u32 index,
 			       struct dsa_switch *ds)
 {
 	const char *name;
 	int err;
 
-	name = of_get_property(port, "label", NULL);
+	name = of_get_property(port->dn, "label", NULL);
 	if (!name)
 		name = "eth%d";
 
@@ -262,7 +267,7 @@ static int dsa_user_port_apply(struct device_node *port, u32 index,
 	return 0;
 }
 
-static void dsa_user_port_unapply(struct device_node *port, u32 index,
+static void dsa_user_port_unapply(struct dsa_port *port, u32 index,
 				  struct dsa_switch *ds)
 {
 	if (ds->ports[index].netdev) {
@@ -274,7 +279,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index,
 
 static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
@@ -308,8 +313,8 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	}
 
 	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_dsa(port)) {
@@ -336,12 +341,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 
 static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 
 	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_dsa(port)) {
@@ -425,7 +430,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 	dst->applied = false;
 }
 
-static int dsa_cpu_parse(struct device_node *port, u32 index,
+static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 			 struct dsa_switch_tree *dst,
 			 struct dsa_switch *ds)
 {
@@ -433,7 +438,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 	struct net_device *ethernet_dev;
 	struct device_node *ethernet;
 
-	ethernet = of_parse_phandle(port, "ethernet", 0);
+	ethernet = of_parse_phandle(port->dn, "ethernet", 0);
 	if (!ethernet)
 		return -EINVAL;
 
@@ -466,13 +471,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 
 static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
 	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_cpu(port)) {
@@ -533,7 +538,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 		 * to have access to a correct value, just like what
 		 * net/dsa/dsa.c::dsa_switch_setup_one does.
 		 */
-		if (!dsa_port_is_cpu(port))
+		if (!dsa_port_is_cpu(&ds->ports[reg]))
 			ds->enabled_port_mask |= 1 << reg;
 	}
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 63ae1484abae..16194a4bb2fe 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -50,8 +50,8 @@ struct dsa_slave_priv {
 
 /* dsa.c */
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
-		      struct device_node *port_dn, int port);
-void dsa_cpu_dsa_destroy(struct device_node *port_dn);
+		      struct dsa_port *dport, int port);
+void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
 void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
-- 
cgit v1.2.3


From 3512a8e95e6acb51d4cd04480689ac484ed538c2 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 26 Jan 2017 10:45:53 -0800
Subject: net: dsa: Suffix function manipulating device_node with _dn

Make it clear that these functions take a device_node structure pointer

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 56c43ca7c049..4c11619a818b 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -93,8 +93,8 @@ static bool dsa_port_is_cpu(struct dsa_port *port)
 	return !!of_parse_phandle(port->dn, "ethernet", 0);
 }
 
-static bool dsa_ds_find_port(struct dsa_switch *ds,
-			     struct device_node *port)
+static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
+				struct device_node *port)
 {
 	u32 index;
 
@@ -104,8 +104,8 @@ static bool dsa_ds_find_port(struct dsa_switch *ds,
 	return false;
 }
 
-static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
-					    struct device_node *port)
+static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst,
+					       struct device_node *port)
 {
 	struct dsa_switch *ds;
 	u32 index;
@@ -115,7 +115,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
 		if (!ds)
 			continue;
 
-		if (dsa_ds_find_port(ds, port))
+		if (dsa_ds_find_port_dn(ds, port))
 			return ds;
 	}
 
@@ -136,7 +136,7 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
 		if (!link)
 			break;
 
-		dst_ds = dsa_dst_find_port(dst, link);
+		dst_ds = dsa_dst_find_port_dn(dst, link);
 		of_node_put(link);
 
 		if (!dst_ds)
@@ -545,7 +545,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 	return 0;
 }
 
-static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
+static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
 {
 	int err;
 
@@ -591,7 +591,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 	u32 tree, index;
 	int i, err;
 
-	err = dsa_parse_member(np, &tree, &index);
+	err = dsa_parse_member_dn(np, &tree, &index);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From bc1727d242fa595c84e8f42b292e82151ba8cf06 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 26 Jan 2017 10:45:54 -0800
Subject: net: dsa: Move ports assignment closer to error checking

Move the assignment of ports in _dsa_register_switch() closer to where
it is checked, no functional change. Re-order declarations to be
preserve the inverted christmas tree style.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 4c11619a818b..75f5d1f8554b 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -586,8 +586,8 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
 	struct device_node *np = dev->of_node;
-	struct device_node *ports = dsa_get_ports(ds, np);
 	struct dsa_switch_tree *dst;
+	struct device_node *ports;
 	u32 tree, index;
 	int i, err;
 
@@ -595,6 +595,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 	if (err)
 		return err;
 
+	ports = dsa_get_ports(ds, np);
 	if (IS_ERR(ports))
 		return PTR_ERR(ports);
 
-- 
cgit v1.2.3


From 3b7b2b0acdbf9ffe4d3a7915e59e6127988b05db Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 26 Jan 2017 14:08:36 -0800
Subject: net: ipv6: remove skb_reserve in getroute

Remove skb_reserve and skb_reset_mac_header from inet6_rtm_getroute. The
allocated skb is not passed through the routing engine (like it is for
IPv4) and has not since the beginning of git time.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4b1f0f98a0e9..74bb1190800e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3416,12 +3416,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		goto errout;
 	}
 
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
-
 	skb_dst_set(skb, &rt->dst);
 
 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
-- 
cgit v1.2.3


From 1f17e2f2c8a8be3430813119fa7b633398f6185b Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 26 Jan 2017 13:54:08 -0800
Subject: net: ipv6: ignore null_entry on route dumps

lkp-robot reported a BUG:
[   10.151226] BUG: unable to handle kernel NULL pointer dereference at 00000198
[   10.152525] IP: rt6_fill_node+0x164/0x4b8
[   10.153307] *pdpt = 0000000012ee5001 *pde = 0000000000000000
[   10.153309]
[   10.154492] Oops: 0000 [#1]
[   10.154987] CPU: 0 PID: 909 Comm: netifd Not tainted 4.10.0-rc4-00722-g41e8c70ee162-dirty #10
[   10.156482] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[   10.158254] task: d0deb000 task.stack: d0e0c000
[   10.159059] EIP: rt6_fill_node+0x164/0x4b8
[   10.159780] EFLAGS: 00010296 CPU: 0
[   10.160404] EAX: 00000000 EBX: d10c2358 ECX: c1f7c6cc EDX: c1f6ff44
[   10.161469] ESI: 00000000 EDI: c2059900 EBP: d0e0dc4c ESP: d0e0dbe4
[   10.162534]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
[   10.163482] CR0: 80050033 CR2: 00000198 CR3: 10d94660 CR4: 000006b0
[   10.164535] Call Trace:
[   10.164993]  ? paravirt_sched_clock+0x9/0xd
[   10.165727]  ? sched_clock+0x9/0xc
[   10.166329]  ? sched_clock_cpu+0x19/0xe9
[   10.166991]  ? lock_release+0x13e/0x36c
[   10.167652]  rt6_dump_route+0x4c/0x56
[   10.168276]  fib6_dump_node+0x1d/0x3d
[   10.168913]  fib6_walk_continue+0xab/0x167
[   10.169611]  fib6_walk+0x2a/0x40
[   10.170182]  inet6_dump_fib+0xfb/0x1e0
[   10.170855]  netlink_dump+0xcd/0x21f

This happens when the loopback device is set down and a ipv6 fib route
dump is requested.

ip6_null_entry is the root of all ipv6 fib tables making it integrated
into the table and hence passed to the ipv6 route dump code. The
null_entry route uses the loopback device for dst.dev but may not have
rt6i_idev set because of the order in which initializations are done --
ip6_route_net_init is run before addrconf_init has initialized the
loopback device. Fixing the initialization order is a much bigger problem
with no obvious solution thus far.

The BUG is triggered when the loopback is set down and the netif_running
check added by a1a22c1206 fails. The fill_node descends to checking
rt->rt6i_idev for ignore_routes_with_linkdown and since rt6i_idev is
NULL it faults.

The null_entry route should not be processed in a dump request. Catch
and ignore. This check is done in rt6_dump_route as it is the highest
place in the callchain with knowledge of both the route and the network
namespace.

Fixes: a1a22c1206("net: ipv6: Keep nexthop of multipath route on admin down")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 74bb1190800e..5046d2b24004 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3320,6 +3320,10 @@ nla_put_failure:
 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+	struct net *net = arg->net;
+
+	if (rt == net->ipv6.ip6_null_entry)
+		return 0;
 
 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
@@ -3332,7 +3336,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 		}
 	}
 
-	return rt6_fill_node(arg->net,
+	return rt6_fill_node(net,
 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
 		     NLM_F_MULTI);
-- 
cgit v1.2.3


From d35a00b8e33dab7385f724e713ae71c8be0a49f4 Mon Sep 17 00:00:00 2001
From: Felix Jia
Date: Thu, 26 Jan 2017 16:59:17 +1300
Subject: net/ipv6: allow sysctl to change link-local address generation mode

The address generation mode for IPv6 link-local can only be configured
by netlink messages. This patch adds the ability to change the address
generation mode via sysctl.

v1 -> v2
Removed the rtnl lock and switch to use RCU lock to iterate through
the netdev list.

v2 -> v3
Removed the addrgenmode variable from the idev structure and use the
systcl storage for the flag.

Simplifed the logic for sysctl handling by removing the supported
for all operation.

Added support for more types of tunnel interfaces for link-local
address generation.

Based the patches from net-next.

v3 -> v4
Removed unnecessary whitespace changes.

Signed-off-by: Felix Jia <felix.jia@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |   1 +
 include/net/if_inet6.h    |   1 -
 include/uapi/linux/ipv6.h |   1 +
 net/ipv6/addrconf.c       | 104 +++++++++++++++++++++++++++++++++++++---------
 4 files changed, 86 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 671d014e6429..71be5b330d21 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -69,6 +69,7 @@ struct ipv6_devconf {
 	__s32		seg6_require_hmac;
 #endif
 	__u32		enhanced_dad;
+	__u32		addr_gen_mode;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 0fa4c324b713..f656f9051aca 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -205,7 +205,6 @@ struct inet6_dev {
 	__s32			rs_interval;	/* in jiffies */
 	__u8			rs_probes;
 
-	__u8			addr_gen_mode;
 	unsigned long		tstamp; /* ipv6InterfaceTable update timestamp */
 	struct rcu_head		rcu;
 };
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index eaf65dc82e22..8ef9e75e004e 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -182,6 +182,7 @@ enum {
 	DEVCONF_SEG6_ENABLED,
 	DEVCONF_SEG6_REQUIRE_HMAC,
 	DEVCONF_ENHANCED_DAD,
+	DEVCONF_ADDR_GEN_MODE,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ac9bd5620f81..e35259dd17ba 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 /* Check if a valid qdisc is available */
@@ -386,9 +388,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
 
 	if (ndev->cnf.stable_secret.initialized)
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	else
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+		ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
 
 	ndev->cnf.mtu6 = dev->mtu;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2387,8 +2389,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
 
 static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
 {
-	return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
-	       idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+	return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+	       idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
 }
 
 int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3152,7 +3154,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 
 	ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
 
-	switch (idev->addr_gen_mode) {
+	switch (idev->cnf.addr_gen_mode) {
 	case IN6_ADDR_GEN_MODE_RANDOM:
 		ipv6_gen_mode_random_init(idev);
 		/* fallthrough */
@@ -3204,8 +3206,8 @@ static void addrconf_dev_config(struct net_device *dev)
 
 	/* this device type has no EUI support */
 	if (dev->type == ARPHRD_NONE &&
-	    idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+	    idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
 
 	addrconf_addr_gen(idev, false);
 }
@@ -4982,6 +4984,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
 #endif
 	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
+	array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5093,7 +5096,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
 	if (!nla)
 		goto nla_put_failure;
 
-	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode))
+	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
 		goto nla_put_failure;
 
 	read_lock_bh(&idev->lock);
@@ -5211,6 +5214,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
 	return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
 }
 
+static int check_addr_gen_mode(int mode)
+{
+	if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+	    mode != IN6_ADDR_GEN_MODE_NONE &&
+	    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    mode != IN6_ADDR_GEN_MODE_RANDOM)
+		return -EINVAL;
+	return 1;
+}
+
+static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
+				int mode)
+{
+	if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    !idev->cnf.stable_secret.initialized &&
+	    !net->ipv6.devconf_dflt->stable_secret.initialized)
+		return -EINVAL;
+	return 1;
+}
+
 static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 {
 	int err = -EINVAL;
@@ -5232,18 +5255,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 	if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
 		u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
 
-		if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
-		    mode != IN6_ADDR_GEN_MODE_NONE &&
-		    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    mode != IN6_ADDR_GEN_MODE_RANDOM)
-			return -EINVAL;
-
-		if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    !idev->cnf.stable_secret.initialized &&
-		    !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
+		if (check_addr_gen_mode(mode) < 0 ||
+		    check_stable_privacy(idev, dev_net(dev), mode) < 0)
 			return -EINVAL;
 
-		idev->addr_gen_mode = mode;
+		idev->cnf.addr_gen_mode = mode;
 		err = 0;
 	}
 
@@ -5652,6 +5668,47 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
+					 void __user *buffer, size_t *lenp,
+					 loff_t *ppos)
+{
+	int ret = 0;
+	int new_val;
+	struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+	struct net *net = (struct net *)ctl->extra2;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write) {
+		new_val = *((int *)ctl->data);
+
+		if (check_addr_gen_mode(new_val) < 0)
+			return -EINVAL;
+
+		/* request for default */
+		if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
+			ipv6_devconf_dflt.addr_gen_mode = new_val;
+
+		/* request for individual net device */
+		} else {
+			if (!idev)
+				return ret;
+
+			if (check_stable_privacy(idev, net, new_val) < 0)
+				return -EINVAL;
+
+			if (idev->cnf.addr_gen_mode != new_val) {
+				idev->cnf.addr_gen_mode = new_val;
+				rtnl_lock();
+				addrconf_dev_config(idev->dev);
+				rtnl_unlock();
+			}
+		}
+	}
+
+	return ret;
+}
+
 static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 					 void __user *buffer, size_t *lenp,
 					 loff_t *ppos)
@@ -5702,14 +5759,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 			struct inet6_dev *idev = __in6_dev_get(dev);
 
 			if (idev) {
-				idev->addr_gen_mode =
+				idev->cnf.addr_gen_mode =
 					IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 			}
 		}
 	} else {
 		struct inet6_dev *idev = ctl->extra1;
 
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	}
 
 out:
@@ -6096,6 +6153,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec,
 	},
+	{
+		.procname		= "addr_gen_mode",
+		.data			= &ipv6_devconf.addr_gen_mode,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= addrconf_sysctl_addr_gen_mode,
+	},
 	{
 		/* sentinel */
 	}
-- 
cgit v1.2.3


From 45ce0fd19da9ca21635c367d504b9e4904621ff4 Mon Sep 17 00:00:00 2001
From: Felix Jia
Date: Thu, 26 Jan 2017 16:59:18 +1300
Subject: net/ipv6: support more tunnel interfaces for EUI64 link-local
 generation

Signed-off-by: Felix Jia <felix.jia@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 5 +++++
 net/ipv6/ip6_gre.c  | 3 +++
 net/ipv6/ip6_vti.c  | 4 ++++
 3 files changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e35259dd17ba..4c47656b9f09 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2146,12 +2146,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 	case ARPHRD_SIT:
 		return addrconf_ifid_sit(eui, dev);
 	case ARPHRD_IPGRE:
+	case ARPHRD_TUNNEL:
 		return addrconf_ifid_gre(eui, dev);
 	case ARPHRD_6LOWPAN:
 		return addrconf_ifid_eui64(eui, dev);
 	case ARPHRD_IEEE1394:
 		return addrconf_ifid_ieee1394(eui, dev);
 	case ARPHRD_TUNNEL6:
+	case ARPHRD_IP6GRE:
 		return addrconf_ifid_ip6tnl(eui, dev);
 	}
 	return -1;
@@ -3195,6 +3197,9 @@ static void addrconf_dev_config(struct net_device *dev)
 	    (dev->type != ARPHRD_IEEE1394) &&
 	    (dev->type != ARPHRD_TUNNEL6) &&
 	    (dev->type != ARPHRD_6LOWPAN) &&
+	    (dev->type != ARPHRD_IP6GRE) &&
+	    (dev->type != ARPHRD_IPGRE) &&
+	    (dev->type != ARPHRD_TUNNEL) &&
 	    (dev->type != ARPHRD_NONE)) {
 		/* Alas, we support only Ethernet autoconfiguration. */
 		return;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 65bdfd1cca80..1ba7567b4d8f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -993,6 +993,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	netif_keep_dst(dev);
+	/* This perm addr will be used as interface identifier by IPv6 */
+	dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(dev->perm_addr);
 }
 
 static int ip6gre_tunnel_init_common(struct net_device *dev)
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d82042c8d8fd..c795fee372c4 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -49,6 +49,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/etherdevice.h>
 
 #define IP6_VTI_HASH_SIZE_SHIFT  5
 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
@@ -842,6 +843,9 @@ static void vti6_dev_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	netif_keep_dst(dev);
+	/* This perm addr will be used as interface identifier by IPv6 */
+	dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(dev->perm_addr);
 }
 
 /**
-- 
cgit v1.2.3


From 158f323b9868b59967ad96957c4ca388161be321 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 27 Jan 2017 07:11:27 -0800
Subject: net: adjust skb->truesize in pskb_expand_head()

Slava Shwartsman reported a warning in skb_try_coalesce(), when we
detect skb->truesize is completely wrong.

In his case, issue came from IPv6 reassembly coping with malicious
datagrams, that forced various pskb_may_pull() to reallocate a bigger
skb->head than the one allocated by NIC driver before entering GRO
layer.

Current code does not change skb->truesize, leaving this burden to
callers if they care enough.

Blindly changing skb->truesize in pskb_expand_head() is not
easy, as some producers might track skb->truesize, for example
in xmit path for back pressure feedback (sk->sk_wmem_alloc)

We can detect the cases where it should be safe to change
skb->truesize :

1) skb is not attached to a socket.
2) If it is attached to a socket, destructor is sock_edemux()

My audit gave only two callers doing their own skb->truesize
manipulation.

I had to remove skb parameter in sock_edemux macro when
CONFIG_INET is not set to avoid a compile error.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Slava Shwartsman <slavash@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h       |  2 +-
 net/core/skbuff.c        | 14 +++++++++++---
 net/netlink/af_netlink.c |  8 +++-----
 net/wireless/util.c      |  2 --
 4 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 7144750d14e5..94e65fd70354 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1534,7 +1534,7 @@ void sock_efree(struct sk_buff *skb);
 #ifdef CONFIG_INET
 void sock_edemux(struct sk_buff *skb);
 #else
-#define sock_edemux(skb) sock_efree(skb)
+#define sock_edemux sock_efree
 #endif
 
 int sock_setsockopt(struct socket *sock, int level, int op,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f8dbe4a7ab46..26c1344cc23e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1192,10 +1192,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		     gfp_t gfp_mask)
 {
-	int i;
-	u8 *data;
-	int size = nhead + skb_end_offset(skb) + ntail;
+	int i, osize = skb_end_offset(skb);
+	int size = osize + nhead + ntail;
 	long off;
+	u8 *data;
 
 	BUG_ON(nhead < 0);
 
@@ -1257,6 +1257,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->hdr_len  = 0;
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+	/* It is not generally safe to change skb->truesize.
+	 * For the moment, we really care of rx path, or
+	 * when skb is orphaned (not attached to a socket).
+	 */
+	if (!skb->sk || skb->destructor == sock_edemux)
+		skb->truesize += size - osize;
+
 	return 0;
 
 nofrags:
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index edcc1e19ad53..7b73c7c161a9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1210,11 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		skb = nskb;
 	}
 
-	if (!pskb_expand_head(skb, 0, -delta,
-			      (allocation & ~__GFP_DIRECT_RECLAIM) |
-			      __GFP_NOWARN | __GFP_NORETRY))
-		skb->truesize -= delta;
-
+	pskb_expand_head(skb, 0, -delta,
+			 (allocation & ~__GFP_DIRECT_RECLAIM) |
+			 __GFP_NOWARN | __GFP_NORETRY);
 	return skb;
 }
 
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 1b9296882dcd..68e5f2ecee1a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -618,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 
 		if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
 			return -ENOMEM;
-
-		skb->truesize += head_need;
 	}
 
 	if (encaps_data) {
-- 
cgit v1.2.3


From 7c946062b3ae2b7f002383e5e402113e98ad3c77 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann
Date: Sat, 28 Jan 2017 10:12:39 +0100
Subject: batman-adv: Fix double call of dev_queue_xmit

The net_xmit_eval has side effects because it is not making sure that e
isn't evaluated twice.

    #define net_xmit_eval(e)        ((e) == NET_XMIT_CN ? 0 : (e))

The code requested by David Miller [1]

    return net_xmit_eval(dev_queue_xmit(skb));

will get transformed into

    return ((dev_queue_xmit(skb)) == NET_XMIT_CN ? 0 : (dev_queue_xmit(skb)))

dev_queue_xmit will therefore be tried again (with an already consumed skb)
whenever the return code is not NET_XMIT_CN.

[1] https://lkml.kernel.org/r/20170125.225624.965229145391320056.davem@davemloft.net

Fixes: c33705188c49 ("batman-adv: Treat NET_XMIT_CN as transmit successfully")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/send.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d9b2889064a6..1489ec27daff 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -77,6 +77,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 {
 	struct batadv_priv *bat_priv;
 	struct ethhdr *ethhdr;
+	int ret;
 
 	bat_priv = netdev_priv(hard_iface->soft_iface);
 
@@ -115,7 +116,8 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 	 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
 	 * (which is > 0). This will not be treated as an error.
 	 */
-	return net_xmit_eval(dev_queue_xmit(skb));
+	ret = dev_queue_xmit(skb);
+	return net_xmit_eval(ret);
 send_skb_err:
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
-- 
cgit v1.2.3


From 3e7514afc7d728dd47c5fe9d7a1f5216fe659cda Mon Sep 17 00:00:00 2001
From: Sven Eckelmann
Date: Sat, 28 Jan 2017 10:23:30 +0100
Subject: batman-adv: Fix includes for IS_ERR/ERR_PTR

IS_ERR/ERR_PTR are not defined in linux/device.h but in linux/err.h. The
files using these macros therefore have to include the correct one.

Reported-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/debugfs.c  | 2 +-
 net/batman-adv/tp_meter.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 5406148b9497..e32ad47c6efd 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -19,7 +19,7 @@
 #include "main.h"
 
 #include <linux/debugfs.h>
-#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/fs.h>
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 07f64b60b528..c94ebdecdc3d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -23,7 +23,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
-#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/etherdevice.h>
 #include <linux/fs.h>
 #include <linux/if_ether.h>
-- 
cgit v1.2.3


From a0c02161ecfc2f40a0837926efac5376bc6fd6d3 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:36 -0500
Subject: net: dsa: variable number of ports

Change the ports[DSA_MAX_PORTS] array of the dsa_switch structure for a
zero-length array, allocated at the same time as the dsa_switch
structure itself. A dsa_switch_alloc() helper is provided for that.

This commit brings no functional change yet since we pass DSA_MAX_PORTS
as the number of ports for the moment. Future patches can update the DSA
drivers separately to support dynamic number of ports.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c |  7 ++++---
 drivers/net/dsa/mv88e6xxx/chip.c |  3 +--
 drivers/net/dsa/qca8k.c          |  3 +--
 include/net/dsa.h                |  6 +++++-
 net/dsa/dsa.c                    |  5 ++---
 net/dsa/dsa2.c                   | 16 ++++++++++++++++
 6 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index bb210b12ad1b..31afc4d4b68b 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1790,14 +1790,15 @@ struct b53_device *b53_switch_alloc(struct device *base,
 	struct dsa_switch *ds;
 	struct b53_device *dev;
 
-	ds = devm_kzalloc(base, sizeof(*ds) + sizeof(*dev), GFP_KERNEL);
+	ds = dsa_switch_alloc(base, DSA_MAX_PORTS);
 	if (!ds)
 		return NULL;
 
-	dev = (struct b53_device *)(ds + 1);
+	dev = devm_kzalloc(base, sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
 
 	ds->priv = dev;
-	ds->dev = base;
 	dev->dev = base;
 
 	dev->ds = ds;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 921e53351786..cb7b24748336 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4361,11 +4361,10 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip)
 	struct device *dev = chip->dev;
 	struct dsa_switch *ds;
 
-	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
+	ds = dsa_switch_alloc(dev, DSA_MAX_PORTS);
 	if (!ds)
 		return -ENOMEM;
 
-	ds->dev = dev;
 	ds->priv = chip;
 	ds->ops = &mv88e6xxx_switch_ops;
 
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index c084aa484d2b..f67c6a3cebff 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -954,12 +954,11 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
 	if (id != QCA8K_ID_QCA8337)
 		return -ENODEV;
 
-	priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds), GFP_KERNEL);
+	priv->ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS);
 	if (!priv->ds)
 		return -ENOMEM;
 
 	priv->ds->priv = priv;
-	priv->ds->dev = &mdiodev->dev;
 	priv->ds->ops = &qca8k_switch_ops;
 	mutex_init(&priv->reg_mutex);
 	dev_set_drvdata(&mdiodev->dev, priv);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 92fd795e9573..24e1d935ae68 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -190,8 +190,11 @@ struct dsa_switch {
 	u32			cpu_port_mask;
 	u32			enabled_port_mask;
 	u32			phys_mii_mask;
-	struct dsa_port		ports[DSA_MAX_PORTS];
 	struct mii_bus		*slave_mii_bus;
+
+	/* Dynamically allocated ports, keep last */
+	size_t num_ports;
+	struct dsa_port ports[];
 };
 
 static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
@@ -386,6 +389,7 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
 	return dst->rcv != NULL;
 }
 
+struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n);
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds, struct device *dev);
 #ifdef CONFIG_PM_SLEEP
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 07e863369e04..de3ffb421ee4 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -347,8 +347,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	/*
 	 * Allocate and initialise switch state.
 	 */
-	ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL);
-	if (ds == NULL)
+	ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
+	if (!ds)
 		return ERR_PTR(-ENOMEM);
 
 	ds->dst = dst;
@@ -356,7 +356,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	ds->cd = cd;
 	ds->ops = ops;
 	ds->priv = priv;
-	ds->dev = parent;
 
 	ret = dsa_switch_setup_one(ds, parent);
 	if (ret)
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 75f5d1f8554b..4b3a44bec5c8 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -666,6 +666,22 @@ out:
 	return err;
 }
 
+struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
+{
+	size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
+	struct dsa_switch *ds;
+
+	ds = devm_kzalloc(dev, size, GFP_KERNEL);
+	if (!ds)
+		return NULL;
+
+	ds->dev = dev;
+	ds->num_ports = n;
+
+	return ds;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_alloc);
+
 int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
 	int err;
-- 
cgit v1.2.3


From 26895e299cfb583d304553e9c259e694a7e83397 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:37 -0500
Subject: net: dsa: use ds->num_ports when possible

The dsa_switch structure contains the number of ports. Use it where the
structure is valid instead of the DSA_MAX_PORTS value.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c         | 16 ++++++++--------
 net/dsa/dsa2.c        | 12 ++++++------
 net/dsa/slave.c       |  2 +-
 net/dsa/tag_brcm.c    |  2 +-
 net/dsa/tag_dsa.c     |  2 +-
 net/dsa/tag_edsa.c    |  2 +-
 net/dsa/tag_trailer.c |  2 +-
 7 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index de3ffb421ee4..619e57a44d1d 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -145,7 +145,7 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
 	struct dsa_port *dport;
 	int ret, port;
 
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 
@@ -218,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	/*
 	 * Validate supplied switch configuration.
 	 */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		char *name;
 
 		name = cd->port_names[i];
@@ -242,7 +242,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 		valid_name_found = true;
 	}
 
-	if (!valid_name_found && i == DSA_MAX_PORTS)
+	if (!valid_name_found && i == ds->num_ports)
 		return -EINVAL;
 
 	/* Make the built-in MII bus mask match the number of ports,
@@ -295,7 +295,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	/*
 	 * Create network devices for physical switch ports.
 	 */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		ds->ports[i].dn = cd->port_dn[i];
 
 		if (!(ds->enabled_port_mask & (1 << i)))
@@ -377,7 +377,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 	int port;
 
 	/* Destroy network devices for physical switch ports. */
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(ds->enabled_port_mask & (1 << port)))
 			continue;
 
@@ -388,7 +388,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 	}
 
 	/* Disable configuration of the CPU and DSA ports */
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 		dsa_cpu_dsa_destroy(&ds->ports[port]);
@@ -408,7 +408,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
 	int i, ret = 0;
 
 	/* Suspend slave network devices */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
@@ -435,7 +435,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
 		return ret;
 
 	/* Resume slave network devices */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 4b3a44bec5c8..6e7b3e88b778 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -98,7 +98,7 @@ static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
 {
 	u32 index;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++)
+	for (index = 0; index < ds->num_ports; index++)
 		if (ds->ports[index].dn == port)
 			return true;
 	return false;
@@ -159,7 +159,7 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	u32 index;
 	int err;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
+	for (index = 0; index < ds->num_ports; index++) {
 		port = &ds->ports[index];
 		if (!dsa_port_is_valid(port))
 			continue;
@@ -312,7 +312,7 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 			return err;
 	}
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
+	for (index = 0; index < ds->num_ports; index++) {
 		port = &ds->ports[index];
 		if (!dsa_port_is_valid(port))
 			continue;
@@ -344,7 +344,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	struct dsa_port *port;
 	u32 index;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
+	for (index = 0; index < ds->num_ports; index++) {
 		port = &ds->ports[index];
 		if (!dsa_port_is_valid(port))
 			continue;
@@ -475,7 +475,7 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	u32 index;
 	int err;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
+	for (index = 0; index < ds->num_ports; index++) {
 		port = &ds->ports[index];
 		if (!dsa_port_is_valid(port))
 			continue;
@@ -529,7 +529,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 		if (err)
 			return err;
 
-		if (reg >= DSA_MAX_PORTS)
+		if (reg >= ds->num_ports)
 			return -EINVAL;
 
 		ds->ports[reg].dn = port;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9750dd6f8c17..26b2c070b15a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -404,7 +404,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds,
 {
 	int i;
 
-	for (i = 0; i < DSA_MAX_PORTS; ++i) {
+	for (i = 0; i < ds->num_ports; ++i) {
 		struct dsa_port *dp = &ds->ports[i];
 
 		if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cb5a2b7a0118..93e4458c02f9 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -128,7 +128,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
 	/* Validate port against switch setup, either the port is totally */
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/* Remove Broadcom tag and update checksum */
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index bce79ffe342b..8fa4b1942671 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6c1720e88537..929de581a846 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 271128a2dc64..d8e55a845d7f 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto out_drop;
 
 	source_port = trailer[1] & 7;
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	pskb_trim_rcsum(skb, skb->len - 4);
-- 
cgit v1.2.3


From 818be8489d6fc8f4cc2c7699bbfd8e1983080f10 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:38 -0500
Subject: net: dsa: add ds and index to dsa_port

Add the physical switch instance and port index a DSA port belongs to to
the dsa_port structure.

That can be used later to retrieve information about a physical port
when configuring a switch fabric, or lighten up struct dsa_slave_priv.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 ++
 net/dsa/dsa2.c    | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 24e1d935ae68..6bd1f8b05dbd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -140,6 +140,8 @@ struct dsa_switch_tree {
 };
 
 struct dsa_port {
+	struct dsa_switch	*ds;
+	unsigned int		index;
 	struct net_device	*netdev;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 6e7b3e88b778..9f8cc26be9ea 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -670,6 +670,7 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 {
 	size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
 	struct dsa_switch *ds;
+	int i;
 
 	ds = devm_kzalloc(dev, size, GFP_KERNEL);
 	if (!ds)
@@ -678,6 +679,11 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 	ds->dev = dev;
 	ds->num_ports = n;
 
+	for (i = 0; i < ds->num_ports; ++i) {
+		ds->ports[i].index = i;
+		ds->ports[i].ds = ds;
+	}
+
 	return ds;
 }
 EXPORT_SYMBOL_GPL(dsa_switch_alloc);
-- 
cgit v1.2.3


From afdcf151c1f7346207dcee3f8d6d82991dbbb7e5 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:39 -0500
Subject: net: dsa: store a dsa_port in dsa_slave_priv

Store a pointer to the dsa_port structure in the dsa_slave_priv
structure, instead of the switch/port index.

This will allow to store more information such as the bridge device,
needed in DSA drivers for multi-chip configuration.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa_priv.h    |   8 +--
 net/dsa/slave.c       | 164 +++++++++++++++++++++++++-------------------------
 net/dsa/tag_brcm.c    |   4 +-
 net/dsa/tag_dsa.c     |   8 +--
 net/dsa/tag_edsa.c    |   8 +--
 net/dsa/tag_qca.c     |   2 +-
 net/dsa/tag_trailer.c |   2 +-
 7 files changed, 96 insertions(+), 100 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 16194a4bb2fe..c519bd0e9206 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -25,12 +25,8 @@ struct dsa_slave_priv {
 	struct sk_buff *	(*xmit)(struct sk_buff *skb,
 					struct net_device *dev);
 
-	/*
-	 * Which switch this port is a part of, and the port index
-	 * for this port.
-	 */
-	struct dsa_switch	*parent;
-	u8			port;
+	/* DSA port data, such as switch, port index, etc. */
+	struct dsa_port		*dp;
 
 	/*
 	 * The phylib phy_device pointer for the PHY connected
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 26b2c070b15a..a2f0267c4a3c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -61,7 +61,7 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 
-	return p->parent->dst->master_netdev->ifindex;
+	return p->dp->ds->dst->master_netdev->ifindex;
 }
 
 static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
@@ -96,8 +96,8 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
 static int dsa_slave_open(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
-	struct dsa_switch *ds = p->parent;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
+	struct dsa_switch *ds = p->dp->ds;
 	u8 stp_state = dsa_port_is_bridged(p) ?
 			BR_STATE_BLOCKING : BR_STATE_FORWARDING;
 	int err;
@@ -123,12 +123,12 @@ static int dsa_slave_open(struct net_device *dev)
 	}
 
 	if (ds->ops->port_enable) {
-		err = ds->ops->port_enable(ds, p->port, p->phy);
+		err = ds->ops->port_enable(ds, p->dp->index, p->phy);
 		if (err)
 			goto clear_promisc;
 	}
 
-	dsa_port_set_stp_state(ds, p->port, stp_state);
+	dsa_port_set_stp_state(ds, p->dp->index, stp_state);
 
 	if (p->phy)
 		phy_start(p->phy);
@@ -151,8 +151,8 @@ out:
 static int dsa_slave_close(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
-	struct dsa_switch *ds = p->parent;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (p->phy)
 		phy_stop(p->phy);
@@ -168,9 +168,9 @@ static int dsa_slave_close(struct net_device *dev)
 		dev_uc_del(master, dev->dev_addr);
 
 	if (ds->ops->port_disable)
-		ds->ops->port_disable(ds, p->port, p->phy);
+		ds->ops->port_disable(ds, p->dp->index, p->phy);
 
-	dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
+	dsa_port_set_stp_state(ds, p->dp->index, BR_STATE_DISABLED);
 
 	return 0;
 }
@@ -178,7 +178,7 @@ static int dsa_slave_close(struct net_device *dev)
 static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 
 	if (change & IFF_ALLMULTI)
 		dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -189,7 +189,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
 static void dsa_slave_set_rx_mode(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 
 	dev_mc_sync(master, dev);
 	dev_uc_sync(master, dev);
@@ -198,7 +198,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
 static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 	struct sockaddr *addr = a;
 	int err;
 
@@ -228,16 +228,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
 				   struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_port *dp = p->dp;
+	struct dsa_switch *ds = dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans);
+		return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans);
 	}
 
-	ds->ops->port_vlan_add(ds, p->port, vlan, trans);
+	ds->ops->port_vlan_add(ds, dp->index, vlan, trans);
 
 	return 0;
 }
@@ -246,12 +247,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
 				   const struct switchdev_obj_port_vlan *vlan)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (!ds->ops->port_vlan_del)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_vlan_del(ds, p->port, vlan);
+	return ds->ops->port_vlan_del(ds, p->dp->index, vlan);
 }
 
 static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -259,10 +260,10 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 				    switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_vlan_dump)
-		return ds->ops->port_vlan_dump(ds, p->port, vlan, cb);
+		return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -272,16 +273,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
 				  struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans);
+		return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans);
 	}
 
-	ds->ops->port_fdb_add(ds, p->port, fdb, trans);
+	ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans);
 
 	return 0;
 }
@@ -290,11 +291,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
 				  const struct switchdev_obj_port_fdb *fdb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
 	if (ds->ops->port_fdb_del)
-		ret = ds->ops->port_fdb_del(ds, p->port, fdb);
+		ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb);
 
 	return ret;
 }
@@ -304,10 +305,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
 				   switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_fdb_dump)
-		return ds->ops->port_fdb_dump(ds, p->port, fdb, cb);
+		return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -317,16 +318,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev,
 				  struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans);
+		return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans);
 	}
 
-	ds->ops->port_mdb_add(ds, p->port, mdb, trans);
+	ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans);
 
 	return 0;
 }
@@ -335,10 +336,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev,
 				  const struct switchdev_obj_port_mdb *mdb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_mdb_del)
-		return ds->ops->port_mdb_del(ds, p->port, mdb);
+		return ds->ops->port_mdb_del(ds, p->dp->index, mdb);
 
 	return -EOPNOTSUPP;
 }
@@ -348,10 +349,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev,
 				   switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_mdb_dump)
-		return ds->ops->port_mdb_dump(ds, p->port, mdb, cb);
+		return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -371,12 +372,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
 				   struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans))
 		return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
 
-	dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
+	dsa_port_set_stp_state(ds, p->dp->index, attr->u.stp_state);
 
 	return 0;
 }
@@ -386,14 +387,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
 				    struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
 
 	if (ds->ops->port_vlan_filtering)
-		return ds->ops->port_vlan_filtering(ds, p->port,
+		return ds->ops->port_vlan_filtering(ds, p->dp->index,
 						    attr->u.vlan_filtering);
 
 	return 0;
@@ -419,7 +420,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
 				 struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
 	unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
 
@@ -428,7 +429,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
 		return 0;
 
 	/* Keep the fastest ageing time in case of multiple bridges */
-	ds->ports[p->port].ageing_time = ageing_time;
+	p->dp->ageing_time = ageing_time;
 	ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
 
 	if (ds->ops->set_ageing_time)
@@ -553,13 +554,13 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
 	p->bridge_dev = br;
 
 	if (ds->ops->port_bridge_join)
-		ret = ds->ops->port_bridge_join(ds, p->port, br);
+		ret = ds->ops->port_bridge_join(ds, p->dp->index, br);
 
 	return ret == -EOPNOTSUPP ? 0 : ret;
 }
@@ -567,25 +568,25 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 static void dsa_slave_bridge_port_leave(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 
 	if (ds->ops->port_bridge_leave)
-		ds->ops->port_bridge_leave(ds, p->port);
+		ds->ops->port_bridge_leave(ds, p->dp->index);
 
 	p->bridge_dev = NULL;
 
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
-	dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
+	dsa_port_set_stp_state(ds, p->dp->index, BR_STATE_FORWARDING);
 }
 
 static int dsa_slave_port_attr_get(struct net_device *dev,
 				   struct switchdev_attr *attr)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
@@ -633,7 +634,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Queue the SKB for transmission on the parent interface, but
 	 * do not modify its EtherType
 	 */
-	nskb->dev = p->parent->dst->master_netdev;
+	nskb->dev = p->dp->ds->dst->master_netdev;
 	dev_queue_xmit(nskb);
 
 	return NETDEV_TX_OK;
@@ -680,10 +681,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
 static int dsa_slave_get_regs_len(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_regs_len)
-		return ds->ops->get_regs_len(ds, p->port);
+		return ds->ops->get_regs_len(ds, p->dp->index);
 
 	return -EOPNOTSUPP;
 }
@@ -692,10 +693,10 @@ static void
 dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_regs)
-		ds->ops->get_regs(ds, p->port, regs, _p);
+		ds->ops->get_regs(ds, p->dp->index, regs, _p);
 }
 
 static int dsa_slave_nway_reset(struct net_device *dev)
@@ -723,7 +724,7 @@ static u32 dsa_slave_get_link(struct net_device *dev)
 static int dsa_slave_get_eeprom_len(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->cd && ds->cd->eeprom_len)
 		return ds->cd->eeprom_len;
@@ -738,7 +739,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
 				struct ethtool_eeprom *eeprom, u8 *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_eeprom)
 		return ds->ops->get_eeprom(ds, eeprom, data);
@@ -750,7 +751,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
 				struct ethtool_eeprom *eeprom, u8 *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->set_eeprom)
 		return ds->ops->set_eeprom(ds, eeprom, data);
@@ -762,7 +763,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
 				  uint32_t stringset, uint8_t *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (stringset == ETH_SS_STATS) {
 		int len = ETH_GSTRING_LEN;
@@ -772,7 +773,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
 		strncpy(data + 2 * len, "rx_packets", len);
 		strncpy(data + 3 * len, "rx_bytes", len);
 		if (ds->ops->get_strings)
-			ds->ops->get_strings(ds, p->port, data + 4 * len);
+			ds->ops->get_strings(ds, p->dp->index, data + 4 * len);
 	}
 }
 
@@ -853,20 +854,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
 					uint64_t *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	data[0] = dev->stats.tx_packets;
 	data[1] = dev->stats.tx_bytes;
 	data[2] = dev->stats.rx_packets;
 	data[3] = dev->stats.rx_bytes;
 	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, p->port, data + 4);
+		ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4);
 }
 
 static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (sset == ETH_SS_STATS) {
 		int count;
@@ -884,20 +885,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_wol)
-		ds->ops->get_wol(ds, p->port, w);
+		ds->ops->get_wol(ds, p->dp->index, w);
 }
 
 static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
 	if (ds->ops->set_wol)
-		ret = ds->ops->set_wol(ds, p->port, w);
+		ret = ds->ops->set_wol(ds, p->dp->index, w);
 
 	return ret;
 }
@@ -905,13 +906,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret;
 
 	if (!ds->ops->set_eee)
 		return -EOPNOTSUPP;
 
-	ret = ds->ops->set_eee(ds, p->port, p->phy, e);
+	ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e);
 	if (ret)
 		return ret;
 
@@ -924,13 +925,13 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret;
 
 	if (!ds->ops->get_eee)
 		return -EOPNOTSUPP;
 
-	ret = ds->ops->get_eee(ds, p->port, e);
+	ret = ds->ops->get_eee(ds, p->dp->index, e);
 	if (ret)
 		return ret;
 
@@ -945,7 +946,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev,
 				   struct netpoll_info *ni)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	struct net_device *master = ds->dst->master_netdev;
 	struct netpoll *netpoll;
 	int err = 0;
@@ -988,7 +989,7 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev,
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 
-	if (snprintf(name, len, "p%d", p->port) >= len)
+	if (snprintf(name, len, "p%d", p->dp->index) >= len)
 		return -EINVAL;
 
 	return 0;
@@ -1059,7 +1060,7 @@ static struct device_type dsa_type = {
 static void dsa_slave_adjust_link(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	unsigned int status_changed = 0;
 
 	if (p->old_link != p->phy->link) {
@@ -1078,7 +1079,7 @@ static void dsa_slave_adjust_link(struct net_device *dev)
 	}
 
 	if (ds->ops->adjust_link && status_changed)
-		ds->ops->adjust_link(ds, p->port, p->phy);
+		ds->ops->adjust_link(ds, p->dp->index, p->phy);
 
 	if (status_changed)
 		phy_print_status(p->phy);
@@ -1092,9 +1093,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
 
 	if (dev) {
 		p = netdev_priv(dev);
-		ds = p->parent;
+		ds = p->dp->ds;
 		if (ds->ops->fixed_link_update)
-			ds->ops->fixed_link_update(ds, p->port, status);
+			ds->ops->fixed_link_update(ds, p->dp->index, status);
 	}
 
 	return 0;
@@ -1105,7 +1106,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
 				 struct net_device *slave_dev,
 				 int addr)
 {
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
 	if (!p->phy) {
@@ -1123,13 +1124,13 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
 static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 				struct net_device *slave_dev)
 {
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	struct device_node *phy_dn, *port_dn;
 	bool phy_is_fixed = false;
 	u32 phy_flags = 0;
 	int mode, ret;
 
-	port_dn = ds->ports[p->port].dn;
+	port_dn = p->dp->dn;
 	mode = of_get_phy_mode(port_dn);
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
@@ -1150,7 +1151,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	}
 
 	if (ds->ops->get_phy_flags)
-		phy_flags = ds->ops->get_phy_flags(ds, p->port);
+		phy_flags = ds->ops->get_phy_flags(ds, p->dp->index);
 
 	if (phy_dn) {
 		int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1185,9 +1186,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	 * MDIO bus instead
 	 */
 	if (!p->phy) {
-		ret = dsa_slave_phy_connect(p, slave_dev, p->port);
+		ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
 		if (ret) {
-			netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
+			netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+				   p->dp->index, ret);
 			if (phy_is_fixed)
 				of_phy_deregister_fixed_link(port_dn);
 			return ret;
@@ -1275,8 +1277,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	slave_dev->vlan_features = master->vlan_features;
 
 	p = netdev_priv(slave_dev);
-	p->parent = ds;
-	p->port = port;
+	p->dp = &ds->ports[port];
 	p->xmit = dst->tag_ops->xmit;
 
 	p->old_pause = -1;
@@ -1309,10 +1310,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 void dsa_slave_destroy(struct net_device *slave_dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-	struct dsa_switch *ds = p->parent;
 	struct device_node *port_dn;
 
-	port_dn = ds->ports[p->port].dn;
+	port_dn = p->dp->dn;
 
 	netif_carrier_off(slave_dev);
 	if (p->phy) {
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 93e4458c02f9..5d925b6b2bb1 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
 			((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
 	brcm_tag[1] = 0;
 	brcm_tag[2] = 0;
-	if (p->port == 8)
+	if (p->dp->index == 8)
 		brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
-	brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
+	brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK;
 
 	return skb;
 
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 8fa4b1942671..72579ceea381 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
 		 */
 		dsa_header = skb->data + 2 * ETH_ALEN;
-		dsa_header[0] = 0x60 | p->parent->index;
-		dsa_header[1] = p->port << 3;
+		dsa_header[0] = 0x60 | p->dp->ds->index;
+		dsa_header[1] = p->dp->index << 3;
 
 		/*
 		 * Move CFI field from byte 2 to byte 1.
@@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * Construct untagged FROM_CPU DSA tag.
 		 */
 		dsa_header = skb->data + 2 * ETH_ALEN;
-		dsa_header[0] = 0x40 | p->parent->index;
-		dsa_header[1] = p->port << 3;
+		dsa_header[0] = 0x40 | p->dp->ds->index;
+		dsa_header[1] = p->dp->index << 3;
 		dsa_header[2] = 0x00;
 		dsa_header[3] = 0x00;
 	}
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 929de581a846..648c051817a1 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		edsa_header[1] = ETH_P_EDSA & 0xff;
 		edsa_header[2] = 0x00;
 		edsa_header[3] = 0x00;
-		edsa_header[4] = 0x60 | p->parent->index;
-		edsa_header[5] = p->port << 3;
+		edsa_header[4] = 0x60 | p->dp->ds->index;
+		edsa_header[5] = p->dp->index << 3;
 
 		/*
 		 * Move CFI field from byte 6 to byte 5.
@@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		edsa_header[1] = ETH_P_EDSA & 0xff;
 		edsa_header[2] = 0x00;
 		edsa_header[3] = 0x00;
-		edsa_header[4] = 0x40 | p->parent->index;
-		edsa_header[5] = p->port << 3;
+		edsa_header[4] = 0x40 | p->dp->ds->index;
+		edsa_header[5] = p->dp->index << 3;
 		edsa_header[6] = 0x00;
 		edsa_header[7] = 0x00;
 	}
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 736ca8e8c31e..30240f343aea 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Set the version field, and set destination port information */
 	hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
 		QCA_HDR_XMIT_FROM_CPU |
-		BIT(p->port);
+		BIT(p->dp->index);
 
 	*phdr = htons(hdr);
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d8e55a845d7f..26f977176978 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	trailer = skb_put(nskb, 4);
 	trailer[0] = 0x80;
-	trailer[1] = 1 << p->port;
+	trailer[1] = 1 << p->dp->index;
 	trailer[2] = 0x10;
 	trailer[3] = 0x00;
 
-- 
cgit v1.2.3


From a5e9a02e1f182237ef44eb3919cf4dd45ed4db9b Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:40 -0500
Subject: net: dsa: move bridge device in dsa_port

Move the bridge_dev pointer from dsa_slave_priv to dsa_port so that DSA
drivers can access this information and remove the need to cache it.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  1 +
 net/dsa/dsa_priv.h |  1 -
 net/dsa/slave.c    | 10 +++++-----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 6bd1f8b05dbd..924533fd4425 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -146,6 +146,7 @@ struct dsa_port {
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	u8			stp_state;
+	struct net_device	*bridge_dev;
 };
 
 struct dsa_switch {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index c519bd0e9206..3022f2e42cdc 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -38,7 +38,6 @@ struct dsa_slave_priv {
 	int			old_pause;
 	int			old_duplex;
 
-	struct net_device	*bridge_dev;
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a2f0267c4a3c..cdb1df87e111 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -64,9 +64,9 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
 	return p->dp->ds->dst->master_netdev->ifindex;
 }
 
-static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
+static inline bool dsa_port_is_bridged(struct dsa_port *dp)
 {
-	return !!p->bridge_dev;
+	return !!dp->bridge_dev;
 }
 
 static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
@@ -98,7 +98,7 @@ static int dsa_slave_open(struct net_device *dev)
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct net_device *master = p->dp->ds->dst->master_netdev;
 	struct dsa_switch *ds = p->dp->ds;
-	u8 stp_state = dsa_port_is_bridged(p) ?
+	u8 stp_state = dsa_port_is_bridged(p->dp) ?
 			BR_STATE_BLOCKING : BR_STATE_FORWARDING;
 	int err;
 
@@ -557,7 +557,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
-	p->bridge_dev = br;
+	p->dp->bridge_dev = br;
 
 	if (ds->ops->port_bridge_join)
 		ret = ds->ops->port_bridge_join(ds, p->dp->index, br);
@@ -574,7 +574,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
 	if (ds->ops->port_bridge_leave)
 		ds->ops->port_bridge_leave(ds, p->dp->index);
 
-	p->bridge_dev = NULL;
+	p->dp->bridge_dev = NULL;
 
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
-- 
cgit v1.2.3


From f123f2fbedc7c2723ceb050cd88c2ea1d6a8be32 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 27 Jan 2017 15:29:41 -0500
Subject: net: dsa: pass bridge device when a port leaves

Upon reception of the NETDEV_CHANGEUPPER, a leaving port is already
unbridged, so reflect this by assigning the port's bridge_dev pointer to
NULL before calling the port_bridge_leave DSA driver operation.

Now that the bridge_dev pointer is exposed to the drivers, reflecting
the current state of the DSA switch fabric is necessary for the drivers
to adjust their port based VLANs correctly.

Pass the bridge device pointer to the port_bridge_leave operation so
that drivers have all information to re-program their chips properly,
and do not need to cache it anymore.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c |  2 +-
 drivers/net/dsa/b53/b53_priv.h   |  2 +-
 drivers/net/dsa/mv88e6xxx/chip.c |  3 ++-
 drivers/net/dsa/qca8k.c          |  2 +-
 include/net/dsa.h                |  3 ++-
 net/dsa/slave.c                  | 10 +++++-----
 6 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 31afc4d4b68b..32fdcf5570c8 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1354,7 +1354,7 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge)
 }
 EXPORT_SYMBOL(b53_br_join);
 
-void b53_br_leave(struct dsa_switch *ds, int port)
+void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct b53_device *dev = ds->priv;
 	struct net_device *bridge = dev->ports[port].bridge_dev;
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index a8031b382c55..5dafb70e75fc 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -382,7 +382,7 @@ void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data);
 void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
 int b53_get_sset_count(struct dsa_switch *ds);
 int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
-void b53_br_leave(struct dsa_switch *ds, int port);
+void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
 void b53_br_fast_age(struct dsa_switch *ds, int port);
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering);
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index cb7b24748336..8eb0dc063f4e 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2343,7 +2343,8 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 	return err;
 }
 
-static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
+static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
+					struct net_device *br)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 	struct net_device *bridge = chip->ports[port].bridge_dev;
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index f67c6a3cebff..c85b187aa3d9 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -775,7 +775,7 @@ qca8k_port_bridge_join(struct dsa_switch *ds, int port,
 }
 
 static void
-qca8k_port_bridge_leave(struct dsa_switch *ds, int port)
+qca8k_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv;
 	int i;
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 924533fd4425..b951e2ebda75 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -325,7 +325,8 @@ struct dsa_switch_ops {
 	int	(*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs);
 	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
-	void	(*port_bridge_leave)(struct dsa_switch *ds, int port);
+	void	(*port_bridge_leave)(struct dsa_switch *ds, int port,
+				     struct net_device *bridge);
 	void	(*port_stp_state_set)(struct dsa_switch *ds, int port,
 				      u8 state);
 	void	(*port_fast_age)(struct dsa_switch *ds, int port);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index cdb1df87e111..08725286f79d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -565,16 +565,16 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	return ret == -EOPNOTSUPP ? 0 : ret;
 }
 
-static void dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev,
+					struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->dp->ds;
 
+	p->dp->bridge_dev = NULL;
 
 	if (ds->ops->port_bridge_leave)
-		ds->ops->port_bridge_leave(ds, p->dp->index);
-
-	p->dp->bridge_dev = NULL;
+		ds->ops->port_bridge_leave(ds, p->dp->index, br);
 
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
@@ -1343,7 +1343,7 @@ static int dsa_slave_port_upper_event(struct net_device *dev,
 			if (info->linking)
 				err = dsa_slave_bridge_port_join(dev, upper);
 			else
-				dsa_slave_bridge_port_leave(dev);
+				dsa_slave_bridge_port_leave(dev, upper);
 		}
 
 		break;
-- 
cgit v1.2.3


From 5b8784aaf29be20ba8d363e1124d7436d42ef9bf Mon Sep 17 00:00:00 2001
From: andy zhou
Date: Fri, 27 Jan 2017 13:45:28 -0800
Subject: openvswitch: Simplify do_execute_actions().

do_execute_actions() implements a worthwhile optimization: in case
an output action is the last action in an action list, skb_clone()
can be avoided by outputing the current skb. However, the
implementation is more complicated than necessary.  This patch
simplify this logic.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bcf7c63..efa9a8858cc6 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1141,12 +1141,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
 			      const struct nlattr *attr, int len)
 {
-	/* Every output action needs a separate clone of 'skb', but the common
-	 * case is just a single output action, so that doing a clone and
-	 * then freeing the original skbuff is wasteful.  So the following code
-	 * is slightly obscure just to avoid that.
-	 */
-	int prev_port = -1;
 	const struct nlattr *a;
 	int rem;
 
@@ -1154,20 +1148,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	     a = nla_next(a, &rem)) {
 		int err = 0;
 
-		if (unlikely(prev_port != -1)) {
-			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
-			if (out_skb)
-				do_output(dp, out_skb, prev_port, key);
+		switch (nla_type(a)) {
+		case OVS_ACTION_ATTR_OUTPUT: {
+			int port = nla_get_u32(a);
+			struct sk_buff *clone;
+
+			/* Every output action needs a separate clone
+			 * of 'skb', In case the output action is the
+			 * last action, cloning can be avoided.
+			 */
+			if (nla_is_last(a, rem)) {
+				do_output(dp, skb, port, key);
+				/* 'skb' has been used for output.
+				 */
+				return 0;
+			}
 
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (clone)
+				do_output(dp, clone, port, key);
 			OVS_CB(skb)->cutlen = 0;
-			prev_port = -1;
-		}
-
-		switch (nla_type(a)) {
-		case OVS_ACTION_ATTR_OUTPUT:
-			prev_port = nla_get_u32(a);
 			break;
+		}
 
 		case OVS_ACTION_ATTR_TRUNC: {
 			struct ovs_action_trunc *trunc = nla_data(a);
@@ -1257,11 +1259,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		}
 	}
 
-	if (prev_port != -1)
-		do_output(dp, skb, prev_port, key);
-	else
-		consume_skb(skb);
-
+	consume_skb(skb);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7e98102f489775d8c000884fca8a0d995ea688a9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Fri, 27 Jan 2017 16:24:38 -0800
Subject: tcp: record pkts sent and retransmistted

Add two stats in SCM_TIMESTAMPING_OPT_STATS:

TCP_NLA_DATA_SEGS_OUT: total data packets sent including retransmission
TCP_NLA_TOTAL_RETRANS: total data packets retransmitted

The names are picked to be consistent with corresponding fields in
TCP_INFO. This allows applications that are using the timestamping
API to measure latency stats to also retrive retransmission rate
of application write.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 6ff35eb48d10..38a2b07afdff 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -227,6 +227,8 @@ enum {
 	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
 	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
 	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
+	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */
+	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2ed472ebf3b5..b751abc56935 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2870,7 +2870,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	struct sk_buff *stats;
 	struct tcp_info info;
 
-	stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
 
@@ -2881,6 +2881,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
 			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+			  tp->data_segs_out, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+			  tp->total_retrans, TCP_NLA_PAD);
 	return stats;
 }
 
-- 
cgit v1.2.3


From 678550c651aee051d66112933c87894f129b9355 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Fri, 27 Jan 2017 16:24:39 -0800
Subject: tcp: include locally failed retries in retransmission stats

Currently the retransmission stats are not incremented if the
retransmit fails locally. But we always increment the other packet
counters that track total packet/bytes sent.  Awkwardly while we
don't count these failed retransmits in RETRANSSEGS, we do count
them in FAILEDRETRANS.

If the qdisc is dropping many packets this could under-estimate
TCP retransmission rate substantially from both SNMP or per-socket
TCP_INFO stats. This patch changes this by always incrementing
retransmission stats on retransmission attempts and failures.

Another motivation is to properly track retransmists in
SCM_TIMESTAMPING_OPT_STATS. Since SCM_TSTAMP_SCHED collection is
triggered in tcp_transmit_skb(), If tp->total_retrans is incremented
after the function, we'll always mis-count by the amount of the
latest retransmission.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 671c69535671..7b2d8762f15f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2771,6 +2771,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
+	/* Update global and local TCP statistics. */
+	segs = tcp_skb_pcount(skb);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	tp->total_retrans += segs;
+
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2788,14 +2795,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	}
 
 	if (likely(!err)) {
-		segs = tcp_skb_pcount(skb);
-
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-		/* Update global TCP statistics. */
-		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans += segs;
+	} else if (err != -EBUSY) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 	return err;
 }
@@ -2818,8 +2820,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
-	} else if (err != -EBUSY) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 
 	if (tp->undo_retrans < 0)
-- 
cgit v1.2.3


From 40be0dda0725886b623d67868db3219a2e74683b Mon Sep 17 00:00:00 2001
From: Rafał Miłecki
Date: Sat, 28 Jan 2017 15:15:42 +0100
Subject: net: add devm version of alloc_etherdev_mqs function

This patch adds devm_alloc_etherdev_mqs function and devm_alloc_etherdev
macro. These can be used for simpler netdev allocation without having to
care about calling free_netdev.

Thanks to this change drivers, their error paths and removal paths may
get simpler by a bit.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h |  5 +++++
 net/ethernet/eth.c          | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'net')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 42add77ae47d..c62b709b1ce0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -54,6 +54,11 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
 #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)
 
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs,
+					   unsigned int rxqs);
+#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
+
 struct sk_buff **eth_gro_receive(struct sk_buff **head,
 				 struct sk_buff *skb);
 int eth_gro_complete(struct sk_buff *skb, int nhoff);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 8c5a479681ca..efdaaab735fc 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -392,6 +392,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 }
 EXPORT_SYMBOL(alloc_etherdev_mqs);
 
+static void devm_free_netdev(struct device *dev, void *res)
+{
+	free_netdev(*(struct net_device **)res);
+}
+
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs, unsigned int rxqs)
+{
+	struct net_device **dr;
+	struct net_device *netdev;
+
+	dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return NULL;
+
+	netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+	if (!netdev) {
+		devres_free(dr);
+		return NULL;
+	}
+
+	*dr = netdev;
+	devres_add(dev, dr);
+
+	return netdev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
 ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 {
 	return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
-- 
cgit v1.2.3


From f991bb9da142ba79b54ed0757f22e756f45e2c5a Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Mon, 30 Jan 2017 06:45:38 +0100
Subject: net: Drop secpath on free after gro merge.

With a followup patch, a gro merged skb can have a secpath.
So drop it before freeing or reusing the skb.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 56818f7eab2b..ef3a969477bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4623,6 +4623,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
 			skb_dst_drop(skb);
+			secpath_reset(skb);
 			kmem_cache_free(skbuff_head_cache, skb);
 		} else {
 			__kfree_skb(skb);
@@ -4663,6 +4664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+	secpath_reset(skb);
 
 	napi->skb = skb;
 }
-- 
cgit v1.2.3


From 1995876a06bcf6f9f7d7b699bdbf387831679771 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Mon, 30 Jan 2017 06:45:43 +0100
Subject: xfrm: Add a dummy network device for napi.

This patch adds a dummy network device so that we can
use gro_cells for IPsec GRO. With this, we handle IPsec
GRO with no impact on the generic networking code.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..3213fe8027be 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -21,6 +21,9 @@ static struct kmem_cache *secpath_cachep __read_mostly;
 static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
 static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
 
+static struct gro_cells gro_cells;
+static struct net_device xfrm_napi_dev;
+
 int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
@@ -371,7 +374,7 @@ resume:
 
 	if (decaps) {
 		skb_dst_drop(skb);
-		netif_rx(skb);
+		gro_cells_receive(&gro_cells, skb);
 		return 0;
 	} else {
 		return x->inner_mode->afinfo->transport_finish(skb, async);
@@ -394,6 +397,13 @@ EXPORT_SYMBOL(xfrm_input_resume);
 
 void __init xfrm_input_init(void)
 {
+	int err;
+
+	init_dummy_netdev(&xfrm_napi_dev);
+	err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
+	if (err)
+		gro_cells.cells = NULL;
+
 	secpath_cachep = kmem_cache_create("secpath_cache",
 					   sizeof(struct sec_path),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-- 
cgit v1.2.3


From bf9f26485d232916cdc2257e42831781e1f075e8 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Mon, 30 Jan 2017 09:48:40 -0800
Subject: net: dsa: Hook {get,set}_rxnfc ethtool operations

In preparation for adding support for CFP/TCAMP in the bcm_sf2 driver add the
plumbing to call into driver specific {get,set}_rxnfc operations.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  8 ++++++++
 net/dsa/slave.c   | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b951e2ebda75..d5d618c3de64 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -377,6 +377,14 @@ struct dsa_switch_ops {
 	int	(*port_mdb_dump)(struct dsa_switch *ds, int port,
 				 struct switchdev_obj_port_mdb *mdb,
 				 int (*cb)(struct switchdev_obj *obj));
+
+	/*
+	 * RXNFC
+	 */
+	int	(*get_rxnfc)(struct dsa_switch *ds, int port,
+			     struct ethtool_rxnfc *nfc, u32 *rule_locs);
+	int	(*set_rxnfc)(struct dsa_switch *ds, int port,
+			     struct ethtool_rxnfc *nfc);
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 08725286f79d..6881889e1a9b 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1002,6 +1002,30 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 	ops->get_strings = dsa_cpu_port_get_strings;
 }
 
+static int dsa_slave_get_rxnfc(struct net_device *dev,
+			       struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs);
+}
+
+static int dsa_slave_set_rxnfc(struct net_device *dev,
+			       struct ethtool_rxnfc *nfc)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->set_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
+}
+
 static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
@@ -1020,6 +1044,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_eee		= dsa_slave_get_eee,
 	.get_link_ksettings	= dsa_slave_get_link_ksettings,
 	.set_link_ksettings	= dsa_slave_set_link_ksettings,
+	.get_rxnfc		= dsa_slave_get_rxnfc,
+	.set_rxnfc		= dsa_slave_set_rxnfc,
 };
 
 static const struct net_device_ops dsa_slave_netdev_ops = {
-- 
cgit v1.2.3


From 63a6fff353d01da5a22b72670c434bf12fa0e3b8 Mon Sep 17 00:00:00 2001
From: Robert Shearman
Date: Thu, 26 Jan 2017 18:02:24 +0000
Subject: net: Avoid receiving packets with an l3mdev on unbound UDP sockets

Packets arriving in a VRF currently are delivered to UDP sockets that
aren't bound to any interface. TCP defaults to not delivering packets
arriving in a VRF to unbound sockets. IP route lookup and socket
transmit both assume that unbound means using the default table and
UDP applications that haven't been changed to be aware of VRFs may not
function correctly in this case since they may not be able to handle
overlapping IP address ranges, or be able to send packets back to the
original sender if required.

So add a sysctl, udp_l3mdev_accept, to control this behaviour with it
being analgous to the existing tcp_l3mdev_accept, namely to allow a
process to have a VRF-global listen socket. Have this default to off
as this is the behaviour that users will expect, given that there is
no explicit mechanism to set unmodified VRF-unaware application into a
default VRF.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 Documentation/networking/vrf.txt       |  7 ++++---
 include/net/netns/ipv4.h               |  4 ++++
 net/ipv4/sysctl_net_ipv4.c             | 11 +++++++++++
 net/ipv4/udp.c                         | 27 ++++++++++++++++++++-------
 net/ipv6/udp.c                         | 27 ++++++++++++++++++++-------
 6 files changed, 66 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 17f2e7791042..fc73eeb7b3b8 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -721,6 +721,13 @@ tcp_challenge_ack_limit - INTEGER
 
 UDP variables:
 
+udp_l3mdev_accept - BOOLEAN
+	Enabling this option allows a "global" bound socket to work
+	across L3 master domains (e.g., VRFs) with packets capable of
+	being received regardless of the L3 domain in which they
+	originated. Only valid when the kernel was compiled with
+	CONFIG_NET_L3_MASTER_DEV.
+
 udp_mem - vector of 3 INTEGERs: min, pressure, max
 	Number of pages allowed for queueing by all UDP sockets.
 
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 755dab856392..3918dae964d4 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -98,10 +98,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
-TCP services running in the default VRF context (ie., not bound to any VRF
-device) can work across all VRF domains by enabling the tcp_l3mdev_accept
-sysctl option:
+TCP & UDP services running in the default VRF context (ie., not bound
+to any VRF device) can work across all VRF domains by enabling the
+tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
     sysctl -w net.ipv4.tcp_l3mdev_accept=1
+    sysctl -w net.ipv4.udp_l3mdev_accept=1
 
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e365732b8051..622d2da27135 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -124,6 +124,10 @@ struct netns_ipv4 {
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
 
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	int sysctl_udp_l3mdev_accept;
+#endif
+
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
 	int sysctl_igmp_llm_reports;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b861997fdc5..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1012,6 +1012,17 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_privileged_ports,
 	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "udp_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_udp_l3mdev_accept,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{ }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d6dddcf59e79..cf6ba3387401 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -134,6 +134,17 @@ EXPORT_SYMBOL(udp_memory_allocated);
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
+/* IPCB reference means this can not be used from early demux */
+static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+	if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+		return true;
+#endif
+	return false;
+}
+
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
@@ -369,7 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 
 static int compute_score(struct sock *sk, struct net *net,
 			 __be32 saddr, __be16 sport,
-			 __be32 daddr, unsigned short hnum, int dif)
+			 __be32 daddr, unsigned short hnum, int dif,
+			 bool exact_dif)
 {
 	int score;
 	struct inet_sock *inet;
@@ -400,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		score += 4;
 	}
 
-	if (sk->sk_bound_dev_if) {
+	if (sk->sk_bound_dev_if || exact_dif) {
 		if (sk->sk_bound_dev_if != dif)
 			return -1;
 		score += 4;
@@ -425,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
 /* called with rcu_read_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
 		__be32 saddr, __be16 sport,
-		__be32 daddr, unsigned int hnum, int dif,
+		__be32 daddr, unsigned int hnum, int dif, bool exact_dif,
 		struct udp_hslot *hslot2,
 		struct sk_buff *skb)
 {
@@ -437,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 	badness = 0;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -472,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	bool exact_dif = udp_lib_exact_dif_match(net, skb);
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
@@ -484,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 		result = udp4_lib_lookup2(net, saddr, sport,
 					  daddr, hnum, dif,
-					  hslot2, skb);
+					  exact_dif, hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
 			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -499,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 			result = udp4_lib_lookup2(net, saddr, sport,
 						  daddr, hnum, dif,
-						  hslot2, skb);
+						  exact_dif, hslot2, skb);
 		}
 		return result;
 	}
@@ -508,7 +521,7 @@ begin:
 	badness = 0;
 	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 05d69324862e..b4c6516a3a0c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -55,6 +55,16 @@
 #include <trace/events/skb.h>
 #include "udp_impl.h"
 
+static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+	if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+	    skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
+		return true;
+#endif
+	return false;
+}
+
 static u32 udp6_ehashfn(const struct net *net,
 			const struct in6_addr *laddr,
 			const u16 lport,
@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk)
 static int compute_score(struct sock *sk, struct net *net,
 			 const struct in6_addr *saddr, __be16 sport,
 			 const struct in6_addr *daddr, unsigned short hnum,
-			 int dif)
+			 int dif, bool exact_dif)
 {
 	int score;
 	struct inet_sock *inet;
@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		score++;
 	}
 
-	if (sk->sk_bound_dev_if) {
+	if (sk->sk_bound_dev_if || exact_dif) {
 		if (sk->sk_bound_dev_if != dif)
 			return -1;
 		score++;
@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net,
 static struct sock *udp6_lib_lookup2(struct net *net,
 		const struct in6_addr *saddr, __be16 sport,
 		const struct in6_addr *daddr, unsigned int hnum, int dif,
-		struct udp_hslot *hslot2,
+		bool exact_dif, struct udp_hslot *hslot2,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 	badness = -1;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	bool exact_dif = udp6_lib_exact_dif_match(net, skb);
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			goto begin;
 
 		result = udp6_lib_lookup2(net, saddr, sport,
-					  daddr, hnum, dif,
+					  daddr, hnum, dif, exact_dif,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
 
 			result = udp6_lib_lookup2(net, saddr, sport,
 						  daddr, hnum, dif,
-						  hslot2, skb);
+						  exact_dif, hslot2,
+						  skb);
 		}
 		return result;
 	}
@@ -247,7 +259,8 @@ begin:
 	result = NULL;
 	badness = -1;
 	sk_for_each_rcu(sk, &hslot->head) {
-		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
+		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
+				      exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-- 
cgit v1.2.3


From 30357d7d8aaf2a980ab17c2ce054b2b87e60af88 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Mon, 30 Jan 2017 12:07:37 -0800
Subject: lwtunnel: remove device arg to lwtunnel_build_state

Nothing about lwt state requires a device reference, so remove the
input argument.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h    |  6 +++---
 net/core/lwt_bpf.c        |  2 +-
 net/core/lwtunnel.c       |  4 ++--
 net/ipv4/fib_semantics.c  | 27 ++++++++-------------------
 net/ipv4/ip_tunnel_core.c |  4 ++--
 net/ipv6/ila/ila_lwt.c    |  2 +-
 net/ipv6/route.c          |  2 +-
 net/ipv6/seg6_iptunnel.c  |  2 +-
 net/mpls/mpls_iptunnel.c  |  2 +-
 9 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 73dd87647460..45399ed132bf 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -33,7 +33,7 @@ struct lwtunnel_state {
 };
 
 struct lwtunnel_encap_ops {
-	int (*build_state)(struct net_device *dev, struct nlattr *encap,
+	int (*build_state)(struct nlattr *encap,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts);
 	void (*destroy_state)(struct lwtunnel_state *lws);
@@ -109,7 +109,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_valid_encap_type(u16 encap_type);
 int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len);
-int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+int lwtunnel_build_state(u16 encap_type,
 			 struct nlattr *encap,
 			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws);
@@ -181,7 +181,7 @@ static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len)
 	return -EOPNOTSUPP;
 }
 
-static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+static inline int lwtunnel_build_state(u16 encap_type,
 				       struct nlattr *encap,
 				       unsigned int family, const void *cfg,
 				       struct lwtunnel_state **lws)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 03600459bcfd..0cfe7b0216c3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -237,7 +237,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
 	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+static int bpf_build_state(struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts)
 {
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index c23465005f2f..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -101,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
 }
 EXPORT_SYMBOL(lwtunnel_encap_del_ops);
 
-int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+int lwtunnel_build_state(u16 encap_type,
 			 struct nlattr *encap, unsigned int family,
 			 const void *cfg, struct lwtunnel_state **lws)
 {
@@ -116,7 +116,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
-		ret = ops->build_state(dev, encap, family, cfg, lws);
+		ret = ops->build_state(encap, family, cfg, lws);
 		if (ret)
 			module_put(ops->owner);
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 319c66de92eb..6306a67880e8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -471,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
 	int ret;
 
 	change_nexthops(fi) {
@@ -503,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			nla = nla_find(attrs, attrlen, RTA_ENCAP);
 			if (nla) {
 				struct lwtunnel_state *lwtstate;
-				struct net_device *dev = NULL;
 				struct nlattr *nla_entype;
 
 				nla_entype = nla_find(attrs, attrlen,
 						      RTA_ENCAP_TYPE);
 				if (!nla_entype)
 					goto err_inval;
-				if (cfg->fc_oif)
-					dev = __dev_get_by_index(net, cfg->fc_oif);
-				ret = lwtunnel_build_state(dev, nla_get_u16(
+
+				ret = lwtunnel_build_state(nla_get_u16(
 							   nla_entype),
 							   nla,  AF_INET, cfg,
 							   &lwtstate);
@@ -597,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
-static int fib_encap_match(struct net *net, u16 encap_type,
+static int fib_encap_match(u16 encap_type,
 			   struct nlattr *encap,
-			   int oif, const struct fib_nh *nh,
+			   const struct fib_nh *nh,
 			   const struct fib_config *cfg)
 {
 	struct lwtunnel_state *lwtstate;
-	struct net_device *dev = NULL;
 	int ret, result = 0;
 
 	if (encap_type == LWTUNNEL_ENCAP_NONE)
 		return 0;
 
-	if (oif)
-		dev = __dev_get_by_index(net, oif);
-	ret = lwtunnel_build_state(dev, encap_type, encap,
+	ret = lwtunnel_build_state(encap_type, encap,
 				   AF_INET, cfg, &lwtstate);
 	if (!ret) {
 		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -623,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
 
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	struct rtnexthop *rtnh;
 	int remaining;
@@ -634,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if (cfg->fc_encap) {
-			if (fib_encap_match(net, cfg->fc_encap_type,
-					    cfg->fc_encap, cfg->fc_oif,
-					    fi->fib_nh, cfg))
+			if (fib_encap_match(cfg->fc_encap_type,
+					    cfg->fc_encap, fi->fib_nh, cfg))
 			    return 1;
 		}
 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1093,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 		if (cfg->fc_encap) {
 			struct lwtunnel_state *lwtstate;
-			struct net_device *dev = NULL;
 
 			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
 				goto err_inval;
-			if (cfg->fc_oif)
-				dev = __dev_get_by_index(net, cfg->fc_oif);
-			err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+			err = lwtunnel_build_state(cfg->fc_encap_type,
 						   cfg->fc_encap, AF_INET, cfg,
 						   &lwtstate);
 			if (err)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9d6c10096d44..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -226,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 };
 
-static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip_tun_build_state(struct nlattr *attr,
 			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts)
 {
@@ -323,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 };
 
-static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip6_tun_build_state(struct nlattr *attr,
 			       unsigned int family, const void *cfg,
 			       struct lwtunnel_state **ts)
 {
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 13b5e85fe0d5..ce1aae4a7fc8 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -115,7 +115,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
 };
 
-static int ila_build_state(struct net_device *dev, struct nlattr *nla,
+static int ila_build_state(struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts)
 {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 61d7006324ed..2563331b0532 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1897,7 +1897,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 	if (cfg->fc_encap) {
 		struct lwtunnel_state *lwtstate;
 
-		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+		err = lwtunnel_build_state(cfg->fc_encap_type,
 					   cfg->fc_encap, AF_INET6, cfg,
 					   &lwtstate);
 		if (err)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index c46f8cbf5ab5..6124e159c882 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -303,7 +303,7 @@ drop:
 	return err;
 }
 
-static int seg6_build_state(struct net_device *dev, struct nlattr *nla,
+static int seg6_build_state(struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts)
 {
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 67b7a955de65..e4e4424f9eb1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -133,7 +133,7 @@ drop:
 	return -EINVAL;
 }
 
-static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+static int mpls_build_state(struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts)
 {
-- 
cgit v1.2.3


From f50f212749e8a28803af3628acbeb85ee0458ed5 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Mon, 30 Jan 2017 12:41:40 -0800
Subject: net: dsa: Add plumbing for port mirroring

Add necessary plumbing at the slave network device level to have switch
drivers implement ndo_setup_tc() and most particularly the cls_matchall
classifier. We add support for two switch operations:

port_add_mirror and port_del_mirror() which configure, on a per-port
basis the mirror parameters requested from the cls_matchall classifier.

Code is largely borrowed from the Mellanox Spectrum switch driver.

Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  33 +++++++++++++
 net/dsa/dsa_priv.h |   3 ++
 net/dsa/slave.c    | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 172 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d5d618c3de64..2cb77e64d648 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -20,6 +20,8 @@
 #include <linux/phy_fixed.h>
 #include <linux/ethtool.h>
 
+struct tc_action;
+
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE = 0,
 	DSA_TAG_PROTO_DSA,
@@ -139,6 +141,28 @@ struct dsa_switch_tree {
 	const struct dsa_device_ops *tag_ops;
 };
 
+/* TC matchall action types, only mirroring for now */
+enum dsa_port_mall_action_type {
+	DSA_PORT_MALL_MIRROR,
+};
+
+/* TC mirroring entry */
+struct dsa_mall_mirror_tc_entry {
+	u8 to_local_port;
+	bool ingress;
+};
+
+/* TC matchall entry */
+struct dsa_mall_tc_entry {
+	struct list_head list;
+	unsigned long cookie;
+	enum dsa_port_mall_action_type type;
+	union {
+		struct dsa_mall_mirror_tc_entry mirror;
+	};
+};
+
+
 struct dsa_port {
 	struct dsa_switch	*ds;
 	unsigned int		index;
@@ -385,6 +409,15 @@ struct dsa_switch_ops {
 			     struct ethtool_rxnfc *nfc, u32 *rule_locs);
 	int	(*set_rxnfc)(struct dsa_switch *ds, int port,
 			     struct ethtool_rxnfc *nfc);
+
+	/*
+	 * TC integration
+	 */
+	int	(*port_mirror_add)(struct dsa_switch *ds, int port,
+				   struct dsa_mall_mirror_tc_entry *mirror,
+				   bool ingress);
+	void	(*port_mirror_del)(struct dsa_switch *ds, int port,
+				   struct dsa_mall_mirror_tc_entry *mirror);
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3022f2e42cdc..a5509b765fc0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -41,6 +41,9 @@ struct dsa_slave_priv {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
 #endif
+
+	/* TC context */
+	struct list_head	mall_tc_list;
 };
 
 /* dsa.c */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6881889e1a9b..09fc3e9462c1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,12 +16,17 @@
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
 #include <linux/mdio.h>
+#include <linux/list.h>
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mirred.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
 #include "dsa_priv.h"
 
+static bool dsa_slave_dev_check(struct net_device *dev);
+
 /* slave mii_bus handling ***************************************************/
 static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
 {
@@ -995,6 +1000,133 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev,
 	return 0;
 }
 
+static struct dsa_mall_tc_entry *
+dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
+			     unsigned long cookie)
+{
+	struct dsa_mall_tc_entry *mall_tc_entry;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+		if (mall_tc_entry->cookie == cookie)
+			return mall_tc_entry;
+
+	return NULL;
+}
+
+static int dsa_slave_add_cls_matchall(struct net_device *dev,
+				      __be16 protocol,
+				      struct tc_cls_matchall_offload *cls,
+				      bool ingress)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = p->dp->ds;
+	struct net *net = dev_net(dev);
+	struct dsa_slave_priv *to_p;
+	struct net_device *to_dev;
+	const struct tc_action *a;
+	int err = -EOPNOTSUPP;
+	LIST_HEAD(actions);
+	int ifindex;
+
+	if (!ds->ops->port_mirror_add)
+		return err;
+
+	if (!tc_single_action(cls->exts))
+		return err;
+
+	tcf_exts_to_list(cls->exts, &actions);
+	a = list_first_entry(&actions, struct tc_action, list);
+
+	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
+		struct dsa_mall_mirror_tc_entry *mirror;
+
+		ifindex = tcf_mirred_ifindex(a);
+		to_dev = __dev_get_by_index(net, ifindex);
+		if (!to_dev)
+			return -EINVAL;
+
+		if (!dsa_slave_dev_check(to_dev))
+			return -EOPNOTSUPP;
+
+		mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+		if (!mall_tc_entry)
+			return -ENOMEM;
+
+		mall_tc_entry->cookie = cls->cookie;
+		mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+		mirror = &mall_tc_entry->mirror;
+
+		to_p = netdev_priv(to_dev);
+
+		mirror->to_local_port = to_p->dp->index;
+		mirror->ingress = ingress;
+
+		err = ds->ops->port_mirror_add(ds, p->dp->index, mirror,
+					       ingress);
+		if (err) {
+			kfree(mall_tc_entry);
+			return err;
+		}
+
+		list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+	}
+
+	return 0;
+}
+
+static void dsa_slave_del_cls_matchall(struct net_device *dev,
+				       struct tc_cls_matchall_offload *cls)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->port_mirror_del)
+		return;
+
+	mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
+	if (!mall_tc_entry)
+		return;
+
+	list_del(&mall_tc_entry->list);
+
+	switch (mall_tc_entry->type) {
+	case DSA_PORT_MALL_MIRROR:
+		ds->ops->port_mirror_del(ds, p->dp->index,
+					 &mall_tc_entry->mirror);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	kfree(mall_tc_entry);
+}
+
+static int dsa_slave_setup_tc(struct net_device *dev, u32 handle,
+			      __be16 protocol, struct tc_to_netdev *tc)
+{
+	bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
+	int ret = -EOPNOTSUPP;
+
+	switch (tc->type) {
+	case TC_SETUP_MATCHALL:
+		switch (tc->cls_mall->command) {
+		case TC_CLSMATCHALL_REPLACE:
+			return dsa_slave_add_cls_matchall(dev, protocol,
+							  tc->cls_mall,
+							  ingress);
+		case TC_CLSMATCHALL_DESTROY:
+			dsa_slave_del_cls_matchall(dev, tc->cls_mall);
+			return 0;
+		}
+	default:
+		break;
+	}
+
+	return ret;
+}
+
 void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 {
 	ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -1069,6 +1201,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
 	.ndo_get_phys_port_name	= dsa_slave_get_phys_port_name,
+	.ndo_setup_tc		= dsa_slave_setup_tc,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
@@ -1285,7 +1418,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	if (slave_dev == NULL)
 		return -ENOMEM;
 
-	slave_dev->features = master->vlan_features;
+	slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
+	slave_dev->hw_features |= NETIF_F_HW_TC;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
 	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->priv_flags |= IFF_NO_QUEUE;
@@ -1304,6 +1438,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 
 	p = netdev_priv(slave_dev);
 	p->dp = &ds->ports[port];
+	INIT_LIST_HEAD(&p->mall_tc_list);
 	p->xmit = dst->tag_ops->xmit;
 
 	p->old_pause = -1;
-- 
cgit v1.2.3


From cdaf25dfc058ee6f7a7b2e2353de00fa288c0cd4 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Mon, 30 Jan 2017 10:55:04 +0100
Subject: smc: some potential use after free bugs

Say we got really unlucky and these failed on the last iteration, then
it could lead to a use after free bug.

Fixes: cd6851f30386 ("smc: remote memory buffers (RMBs)")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 8b1d34378829..0eac633fb354 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -532,6 +532,7 @@ int smc_sndbuf_create(struct smc_sock *smc)
 						__GFP_NORETRY);
 		if (!sndbuf_desc->cpu_addr) {
 			kfree(sndbuf_desc);
+			sndbuf_desc = NULL;
 			/* if send buffer allocation has failed,
 			 * try a smaller one
 			 */
@@ -543,6 +544,7 @@ int smc_sndbuf_create(struct smc_sock *smc)
 		if (rc) {
 			kfree(sndbuf_desc->cpu_addr);
 			kfree(sndbuf_desc);
+			sndbuf_desc = NULL;
 			continue; /* if mapping failed, try smaller one */
 		}
 		sndbuf_desc->used = 1;
@@ -596,6 +598,7 @@ int smc_rmb_create(struct smc_sock *smc)
 					     __GFP_NORETRY);
 		if (!rmb_desc->cpu_addr) {
 			kfree(rmb_desc);
+			rmb_desc = NULL;
 			/* if RMB allocation has failed,
 			 * try a smaller one
 			 */
@@ -607,6 +610,7 @@ int smc_rmb_create(struct smc_sock *smc)
 		if (rc) {
 			kfree(rmb_desc->cpu_addr);
 			kfree(rmb_desc);
+			rmb_desc = NULL;
 			continue; /* if mapping failed, try smaller one */
 		}
 		rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
@@ -619,6 +623,7 @@ int smc_rmb_create(struct smc_sock *smc)
 					 DMA_FROM_DEVICE);
 			kfree(rmb_desc->cpu_addr);
 			kfree(rmb_desc);
+			rmb_desc = NULL;
 			continue;
 		}
 		rmb_desc->used = 1;
-- 
cgit v1.2.3


From 4d1ceea8516cd6adf21f6b75995e2a7d4f376f9b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 30 Jan 2017 18:25:18 -0800
Subject: net: ethtool: convert large order kmalloc allocations to vzalloc

under memory pressure 'ethtool -S' command may warn:
[ 2374.385195] ethtool: page allocation failure: order:4, mode:0x242c0c0
[ 2374.405573] CPU: 12 PID: 40211 Comm: ethtool Not tainted
[ 2374.423071] Call Trace:
[ 2374.423076]  [<ffffffff8148cb29>] dump_stack+0x4d/0x64
[ 2374.423080]  [<ffffffff811667cb>] warn_alloc_failed+0xeb/0x150
[ 2374.423082]  [<ffffffff81169cd3>] ? __alloc_pages_direct_compact+0x43/0xf0
[ 2374.423084]  [<ffffffff8116a25c>] __alloc_pages_nodemask+0x4dc/0xbf0
[ 2374.423091]  [<ffffffffa0023dc2>] ? cmd_exec+0x722/0xcd0 [mlx5_core]
[ 2374.423095]  [<ffffffff811b3dcc>] alloc_pages_current+0x8c/0x110
[ 2374.423097]  [<ffffffff81168859>] alloc_kmem_pages+0x19/0x90
[ 2374.423099]  [<ffffffff81186e5e>] kmalloc_order_trace+0x2e/0xe0
[ 2374.423101]  [<ffffffff811c0084>] __kmalloc+0x204/0x220
[ 2374.423105]  [<ffffffff816c269e>] dev_ethtool+0xe4e/0x1f80
[ 2374.423106]  [<ffffffff816b967e>] ? dev_get_by_name_rcu+0x5e/0x80
[ 2374.423108]  [<ffffffff816d6926>] dev_ioctl+0x156/0x560
[ 2374.423111]  [<ffffffff811d4c68>] ? mem_cgroup_commit_charge+0x78/0x3c0
[ 2374.423117]  [<ffffffff8169d542>] sock_do_ioctl+0x42/0x50
[ 2374.423119]  [<ffffffff8169d9c3>] sock_ioctl+0x1b3/0x250
[ 2374.423121]  [<ffffffff811f0f42>] do_vfs_ioctl+0x92/0x580
[ 2374.423123]  [<ffffffff8100222b>] ? do_audit_syscall_entry+0x4b/0x70
[ 2374.423124]  [<ffffffff8100287c>] ? syscall_trace_enter_phase1+0xfc/0x120
[ 2374.423126]  [<ffffffff811f14a9>] SyS_ioctl+0x79/0x90
[ 2374.423127]  [<ffffffff81002bb0>] do_syscall_64+0x50/0xa0
[ 2374.423129]  [<ffffffff817e19bc>] entry_SYSCALL64_slow_path+0x25/0x25

~1160 mlx5 counters ~= order 4 allocation which is unlikely to succeed
under memory pressure. Convert them to vzalloc() as ethtool_get_regs() does.
Also take care of drivers without counters similar to
commit 67ae7cf1eeda ("ethtool: Allow zero-length register dumps again")
and reduce warn_on to warn_on_once.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 236a21e3c878..6b3eee0834a0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1817,11 +1817,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	ret = __ethtool_get_sset_count(dev, gstrings.string_set);
 	if (ret < 0)
 		return ret;
+	if (ret > S32_MAX / ETH_GSTRING_LEN)
+		return -ENOMEM;
+	WARN_ON_ONCE(!ret);
 
 	gstrings.len = ret;
-
-	data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
-	if (!data)
+	data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
+	if (gstrings.len && !data)
 		return -ENOMEM;
 
 	__ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1830,12 +1832,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
 		goto out;
 	useraddr += sizeof(gstrings);
-	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+	if (gstrings.len &&
+	    copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
 		goto out;
 	ret = 0;
 
 out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
@@ -1912,14 +1915,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
 	if (n_stats < 0)
 		return n_stats;
-	WARN_ON(n_stats == 0);
-
+	if (n_stats > S32_MAX / sizeof(u64))
+		return -ENOMEM;
+	WARN_ON_ONCE(!n_stats);
 	if (copy_from_user(&stats, useraddr, sizeof(stats)))
 		return -EFAULT;
 
 	stats.n_stats = n_stats;
-	data = kmalloc(n_stats * sizeof(u64), GFP_USER);
-	if (!data)
+	data = vzalloc(n_stats * sizeof(u64));
+	if (n_stats && !data)
 		return -ENOMEM;
 
 	ops->get_ethtool_stats(dev, &stats, data);
@@ -1928,12 +1932,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
 		goto out;
 	useraddr += sizeof(stats);
-	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+	if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
 		goto out;
 	ret = 0;
 
  out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
@@ -1948,17 +1952,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 		return -EOPNOTSUPP;
 
 	n_stats = phy_get_sset_count(phydev);
-
 	if (n_stats < 0)
 		return n_stats;
-	WARN_ON(n_stats == 0);
+	if (n_stats > S32_MAX / sizeof(u64))
+		return -ENOMEM;
+	WARN_ON_ONCE(!n_stats);
 
 	if (copy_from_user(&stats, useraddr, sizeof(stats)))
 		return -EFAULT;
 
 	stats.n_stats = n_stats;
-	data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
-	if (!data)
+	data = vzalloc(n_stats * sizeof(u64));
+	if (n_stats && !data)
 		return -ENOMEM;
 
 	mutex_lock(&phydev->lock);
@@ -1969,12 +1974,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
 		goto out;
 	useraddr += sizeof(stats);
-	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+	if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
 		goto out;
 	ret = 0;
 
  out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 160ca0142431c19386db976302fd4b07c587f651 Mon Sep 17 00:00:00 2001
From: Theuns Verwoerd
Date: Tue, 31 Jan 2017 12:23:46 +1300
Subject: rtnetlink: Handle IFLA_MASTER parameter when processing rtnl_newlink

Allow a master interface to be specified as one of the parameters when
creating a new interface via rtnl_newlink.  Previously this would
require invoking interface creation, waiting for it to complete, and
then separately binding that new interface to a master.

In particular, this is used when creating a macvlan child interface for
VRRP in a VRF configuration, allowing the interface creator to specify
directly what master interface should be inherited by the child,
without having to deal with asynchronous complications and potential
race conditions.

Signed-off-by: Theuns Verwoerd <theuns.verwoerd@alliedtelesis.co.nz>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 152744643074..adfb54b896da 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2570,7 +2570,7 @@ replay:
 			return -ENODEV;
 		}
 
-		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+		if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
 			return -EOPNOTSUPP;
 
 		if (!ops) {
@@ -2652,6 +2652,11 @@ replay:
 			if (err < 0)
 				goto out_unregister;
 		}
+		if (tb[IFLA_MASTER]) {
+			err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+			if (err)
+				goto out_unregister;
+		}
 out:
 		if (link_net)
 			put_net(link_net);
-- 
cgit v1.2.3


From b2504a5dbef3305ef41988ad270b0e8ec289331c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 31 Jan 2017 10:20:32 -0800
Subject: net: reduce skb_warn_bad_offload() noise

Dmitry reported warnings occurring in __skb_gso_segment() [1]

All SKB_GSO_DODGY producers can allow user space to feed
packets that trigger the current check.

We could prevent them from doing so, rejecting packets, but
this might add regressions to existing programs.

It turns out our SKB_GSO_DODGY handlers properly set up checksum
information that is needed anyway when packets needs to be segmented.

By checking again skb_needs_check() after skb_mac_gso_segment(),
we should remove these pesky warnings, at a very minor cost.

With help from Willem de Bruijn

[1]
WARNING: CPU: 1 PID: 6768 at net/core/dev.c:2439 skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
lo: caps=(0x000000a2803b7c69, 0x0000000000000000) len=138 data_len=0 gso_size=15883 gso_type=4 ip_summed=0
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 6768 Comm: syz-executor1 Not tainted 4.9.0 #5
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 ffff8801c063ecd8 ffffffff82346bdf ffffffff00000001 1ffff100380c7d2e
 ffffed00380c7d26 0000000041b58ab3 ffffffff84b37e38 ffffffff823468f1
 ffffffff84820740 ffffffff84f289c0 dffffc0000000000 ffff8801c063ee20
Call Trace:
 [<ffffffff82346bdf>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff82346bdf>] dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
 [<ffffffff81827e34>] panic+0x1fb/0x412 kernel/panic.c:179
 [<ffffffff8141f704>] __warn+0x1c4/0x1e0 kernel/panic.c:542
 [<ffffffff8141f7e5>] warn_slowpath_fmt+0xc5/0x100 kernel/panic.c:565
 [<ffffffff8356cbaf>] skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
 [<ffffffff83585cd2>] __skb_gso_segment+0x482/0x780 net/core/dev.c:2706
 [<ffffffff83586f19>] skb_gso_segment include/linux/netdevice.h:3985 [inline]
 [<ffffffff83586f19>] validate_xmit_skb+0x5c9/0xc20 net/core/dev.c:2969
 [<ffffffff835892bb>] __dev_queue_xmit+0xe6b/0x1e70 net/core/dev.c:3383
 [<ffffffff8358a2d7>] dev_queue_xmit+0x17/0x20 net/core/dev.c:3424
 [<ffffffff83ad161d>] packet_snd net/packet/af_packet.c:2930 [inline]
 [<ffffffff83ad161d>] packet_sendmsg+0x32ed/0x4d30 net/packet/af_packet.c:2955
 [<ffffffff834f0aaa>] sock_sendmsg_nosec net/socket.c:621 [inline]
 [<ffffffff834f0aaa>] sock_sendmsg+0xca/0x110 net/socket.c:631
 [<ffffffff834f329a>] ___sys_sendmsg+0x8fa/0x9f0 net/socket.c:1954
 [<ffffffff834f5e58>] __sys_sendmsg+0x138/0x300 net/socket.c:1988
 [<ffffffff834f604d>] SYSC_sendmsg net/socket.c:1999 [inline]
 [<ffffffff834f604d>] SyS_sendmsg+0x2d/0x50 net/socket.c:1995
 [<ffffffff84371941>] entry_SYSCALL_64_fastpath+0x1f/0xc2

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov  <dvyukov@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index e61528c50209..727b6fda0e8c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2658,11 +2658,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 				  netdev_features_t features, bool tx_path)
 {
+	struct sk_buff *segs;
+
 	if (unlikely(skb_needs_check(skb, tx_path))) {
 		int err;
 
-		skb_warn_bad_offload(skb);
-
+		/* We're going to init ->check field in TCP or UDP header */
 		err = skb_cow_head(skb, 0);
 		if (err < 0)
 			return ERR_PTR(err);
@@ -2690,7 +2691,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 	skb_reset_mac_header(skb);
 	skb_reset_mac_len(skb);
 
-	return skb_mac_gso_segment(skb, features);
+	segs = skb_mac_gso_segment(skb, features);
+
+	if (unlikely(skb_needs_check(skb, tx_path)))
+		skb_warn_bad_offload(skb);
+
+	return segs;
 }
 EXPORT_SYMBOL(__skb_gso_segment);
 
-- 
cgit v1.2.3


From 1f5e29ce7989eb75652774004b962ee1eb6e56ca Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 31 Jan 2017 16:51:37 -0800
Subject: net: ipv6: add NLM_F_APPEND in notifications when applicable

IPv6 does not set the NLM_F_APPEND flag in notifications to signal that
a NEWROUTE is an append versus a new route or a replaced one. Add the
flag if the request has it.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..febde6c112bf 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -746,6 +746,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	u16 nlflags = NLM_F_EXCL;
 	int err;
 
+	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+		nlflags |= NLM_F_APPEND;
+
 	ins = &fn->leaf;
 
 	for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
-- 
cgit v1.2.3


From cadb9c9fdbc6a624d57b5ecdfd412b5800848703 Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Tue, 31 Jan 2017 11:33:53 +0200
Subject: net/sched: act_sample: Fix error path in init

Fix error path of in sample init, by releasing the tc hash in case of
failure in psample_group creation.

Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
Reported-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_sample.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 39229756de07..02b67495829c 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -81,8 +81,11 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
 	s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
 	psample_group = psample_group_get(net, s->psample_group_num);
-	if (!psample_group)
+	if (!psample_group) {
+		if (ret == ACT_P_CREATED)
+			tcf_hash_release(*a, bind);
 		return -ENOMEM;
+	}
 	RCU_INIT_POINTER(s->psample_group, psample_group);
 
 	if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
-- 
cgit v1.2.3


From f3b20313aef93f9cfda76fec2c05625f88510f18 Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Tue, 31 Jan 2017 11:33:54 +0200
Subject: net/sched: act_psample: Remove unnecessary ASSERT_RTNL

The ASSERT_RTNL is not necessary in the init function, as it does not
touch any rtnl protected structures, as opposed to the mirred action which
does have to hold a net device.

Reported-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_sample.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 02b67495829c..0b8217b4763f 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -76,7 +76,6 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	}
 	s = to_sample(*a);
 
-	ASSERT_RTNL();
 	s->tcf_action = parm->action;
 	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
 	s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
-- 
cgit v1.2.3


From 10435c1192d06bdb0bac7666452d8219d7e7c477 Mon Sep 17 00:00:00 2001
From: Feng
Date: Fri, 20 Jan 2017 21:40:43 +0800
Subject: netfilter: nf_tables: Eliminate duplicated code in
 nf_tables_table_enable()

If something fails in nf_tables_table_enable(), it unregisters the
chains. But the rollback code is the same as nf_tables_table_disable()
almostly, except there is one counter check.  Now create one wrapper
function to eliminate the duplicated codes.

Signed-off-by: Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 48 ++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6e07c214c208..e6741ac4ccc1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -576,6 +576,28 @@ err:
 	return err;
 }
 
+static void _nf_tables_table_disable(struct net *net,
+				     const struct nft_af_info *afi,
+				     struct nft_table *table,
+				     u32 cnt)
+{
+	struct nft_chain *chain;
+	u32 i = 0;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_active_next(net, chain))
+			continue;
+		if (!(chain->flags & NFT_BASE_CHAIN))
+			continue;
+
+		if (cnt && i++ == cnt)
+			break;
+
+		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+					afi->nops);
+	}
+}
+
 static int nf_tables_table_enable(struct net *net,
 				  const struct nft_af_info *afi,
 				  struct nft_table *table)
@@ -598,18 +620,8 @@ static int nf_tables_table_enable(struct net *net,
 	}
 	return 0;
 err:
-	list_for_each_entry(chain, &table->chains, list) {
-		if (!nft_is_active_next(net, chain))
-			continue;
-		if (!(chain->flags & NFT_BASE_CHAIN))
-			continue;
-
-		if (i-- <= 0)
-			break;
-
-		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
-					afi->nops);
-	}
+	if (i)
+		_nf_tables_table_disable(net, afi, table, i);
 	return err;
 }
 
@@ -617,17 +629,7 @@ static void nf_tables_table_disable(struct net *net,
 				    const struct nft_af_info *afi,
 				    struct nft_table *table)
 {
-	struct nft_chain *chain;
-
-	list_for_each_entry(chain, &table->chains, list) {
-		if (!nft_is_active_next(net, chain))
-			continue;
-		if (!(chain->flags & NFT_BASE_CHAIN))
-			continue;
-
-		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
-					afi->nops);
-	}
+	_nf_tables_table_disable(net, afi, table, 0);
 }
 
 static int nf_tables_updtable(struct nft_ctx *ctx)
-- 
cgit v1.2.3


From 11df4b760f11ca7528c62b1c4b870735d1c62116 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:53 +0100
Subject: netfilter: conntrack: no need to pass ctinfo to error handler

It is never accessed for reading and the only places that write to it
are the icmp(6) handlers, which also set skb->nfct (and skb->nfctinfo).

The conntrack core specifically checks for attached skb->nfct after
->error() invocation and returns early in this case.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h   |  2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 12 ++++++------
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 12 ++++++------
 net/netfilter/nf_conntrack_core.c              |  3 +--
 net/netfilter/nf_conntrack_proto_dccp.c        |  1 -
 net/netfilter/nf_conntrack_proto_sctp.c        |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         |  1 -
 net/netfilter/nf_conntrack_proto_udp.c         |  3 +--
 8 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index e7b836590f0b..85e993e278d5 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -55,7 +55,7 @@ struct nf_conntrack_l4proto {
 	void (*destroy)(struct nf_conn *ct);
 
 	int (*error)(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		     unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		     unsigned int dataoff,
 		     u_int8_t pf, unsigned int hooknum);
 
 	/* Print out the per-protocol part of the tuple. Return like seq_* */
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..566afac98a88 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,13 +128,13 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
 static int
 icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		 enum ip_conntrack_info *ctinfo,
 		 unsigned int hooknum)
 {
 	struct nf_conntrack_tuple innertuple, origtuple;
 	const struct nf_conntrack_l4proto *innerproto;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_zone *zone;
+	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 		return -NF_ACCEPT;
 	}
 
-	*ctinfo = IP_CT_RELATED;
+	ctinfo = IP_CT_RELATED;
 
 	h = nf_conntrack_find_get(net, zone, &innertuple);
 	if (!h) {
@@ -169,11 +169,11 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	}
 
 	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
-		*ctinfo += IP_CT_IS_REPLY;
+		ctinfo += IP_CT_IS_REPLY;
 
 	/* Update skb to refer to this connection */
 	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = *ctinfo;
+	skb->nfctinfo = ctinfo;
 	return NF_ACCEPT;
 }
 
@@ -181,7 +181,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 static int
 icmp_error(struct net *net, struct nf_conn *tmpl,
 	   struct sk_buff *skb, unsigned int dataoff,
-	   enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+	   u8 pf, unsigned int hooknum)
 {
 	const struct icmphdr *icmph;
 	struct icmphdr _ih;
@@ -225,7 +225,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	    icmph->type != ICMP_REDIRECT)
 		return NF_ACCEPT;
 
-	return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+	return icmp_error_message(net, tmpl, skb, hooknum);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f5a61bc3ec2b..44b9af3f813e 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -145,12 +145,12 @@ static int
 icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
 		     unsigned int icmp6off,
-		     enum ip_conntrack_info *ctinfo,
 		     unsigned int hooknum)
 {
 	struct nf_conntrack_tuple intuple, origtuple;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_l4proto *inproto;
+	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
@@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	}
 
-	*ctinfo = IP_CT_RELATED;
+	ctinfo = IP_CT_RELATED;
 
 	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
 				  &intuple);
@@ -185,19 +185,19 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	} else {
 		if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
-			*ctinfo += IP_CT_IS_REPLY;
+			ctinfo += IP_CT_IS_REPLY;
 	}
 
 	/* Update skb to refer to this connection */
 	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = *ctinfo;
+	skb->nfctinfo = ctinfo;
 	return NF_ACCEPT;
 }
 
 static int
 icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	     struct sk_buff *skb, unsigned int dataoff,
-	     enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+	     u8 pf, unsigned int hooknum)
 {
 	const struct icmp6hdr *icmp6h;
 	struct icmp6hdr _ih;
@@ -232,7 +232,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	if (icmp6h->icmp6_type >= 128)
 		return NF_ACCEPT;
 
-	return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum);
+	return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3a073cd9fcf4..86186a2e2715 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1326,8 +1326,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	 * inverse of the return code tells to the netfilter
 	 * core what to do with the packet. */
 	if (l4proto->error != NULL) {
-		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
-				     pf, hooknum);
+		ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
 		if (ret <= 0) {
 			NF_CT_STAT_INC_ATOMIC(net, error);
 			NF_CT_STAT_INC_ATOMIC(net, invalid);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b68ce6ac13b3..93dd1c5b7bff 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -561,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 
 static int dccp_error(struct net *net, struct nf_conn *tmpl,
 		      struct sk_buff *skb, unsigned int dataoff,
-		      enum ip_conntrack_info *ctinfo,
 		      u_int8_t pf, unsigned int hooknum)
 {
 	struct dccp_hdr _dh, *dh;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 44a647418948..33279aab583d 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -508,7 +508,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 }
 
 static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
-		      unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		      unsigned int dataoff,
 		      u8 pf, unsigned int hooknum)
 {
 	const struct sctphdr *sh;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69f687740c76..b122e9dacfed 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
 static int tcp_error(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
 		     unsigned int dataoff,
-		     enum ip_conntrack_info *ctinfo,
 		     u_int8_t pf,
 		     unsigned int hooknum)
 {
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index ae63944c9dc4..f6ebce6178ca 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -112,7 +112,6 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 static int udplite_error(struct net *net, struct nf_conn *tmpl,
 			 struct sk_buff *skb,
 			 unsigned int dataoff,
-			 enum ip_conntrack_info *ctinfo,
 			 u8 pf, unsigned int hooknum)
 {
 	unsigned int udplen = skb->len - dataoff;
@@ -162,7 +161,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 #endif
 
 static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		     unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		     unsigned int dataoff,
 		     u_int8_t pf,
 		     unsigned int hooknum)
 {
-- 
cgit v1.2.3


From 6e10148c5c85629832d9156f337cbf67e96b69fe Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:54 +0100
Subject: netfilter: reset netfilter state when duplicating packet

We should also toss nf_bridge_info, if any -- packet is leaving via
ip_local_out, also, this skb isn't bridged -- it is a locally generated
copy.  Also this avoids the need to touch this later when skb->nfct is
replaced with 'unsigned long _nfct' in followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_dup_ipv4.c | 2 +-
 net/ipv6/netfilter/nf_dup_ipv6.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..a981ef7151ca 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,7 +68,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Avoid counting cloned packets towards the original connection. */
-	nf_conntrack_put(skb->nfct);
+	nf_reset(skb);
 	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a84b5ad9ecb..5f52e5f90e7e 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -57,7 +57,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 		return;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
+	nf_reset(skb);
 	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
-- 
cgit v1.2.3


From 97a6ad13decc16c5adbf181283932daba7e17faf Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:55 +0100
Subject: netfilter: reduce direct skb->nfct usage

Next patch makes direct skb->nfct access illegal, reduce noise
in next patch by using accessors we already have.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h               |  9 ++++++---
 net/netfilter/nf_conntrack_core.c | 15 +++++++++------
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index cd6018a9ee24..2a344ebd7ebe 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1554,10 +1554,13 @@ static inline void ip_vs_notrack(struct sk_buff *skb)
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
 	if (!ct || !nf_ct_is_untracked(ct)) {
-		nf_conntrack_put(skb->nfct);
-		skb->nfct = &nf_ct_untracked_get()->ct_general;
+		struct nf_conn *untracked;
+
+		nf_conntrack_put(&ct->ct_general);
+		untracked = nf_ct_untracked_get();
+		nf_conntrack_get(&untracked->ct_general);
+		skb->nfct = &untracked->ct_general;
 		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
 	}
 #endif
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 86186a2e2715..adb7af3a4c4c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -686,8 +686,11 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	    !nfct_nat(ct) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
-		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
-		nf_conntrack_put(skb->nfct);
+		enum ip_conntrack_info oldinfo;
+		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+
+		nf_ct_acct_merge(ct, ctinfo, loser_ct);
+		nf_conntrack_put(&loser_ct->ct_general);
 		/* Assign conntrack already in hashes to this skbuff. Don't
 		 * modify skb->nfctinfo to ensure consistent stateful filtering.
 		 */
@@ -1288,7 +1291,7 @@ unsigned int
 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		struct sk_buff *skb)
 {
-	struct nf_conn *ct, *tmpl = NULL;
+	struct nf_conn *ct, *tmpl;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_l3proto *l3proto;
 	struct nf_conntrack_l4proto *l4proto;
@@ -1298,9 +1301,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	int set_reply = 0;
 	int ret;
 
-	if (skb->nfct) {
+	tmpl = nf_ct_get(skb, &ctinfo);
+	if (tmpl) {
 		/* Previously seen (loopback or untracked)?  Ignore. */
-		tmpl = (struct nf_conn *)skb->nfct;
 		if (!nf_ct_is_template(tmpl)) {
 			NF_CT_STAT_INC_ATOMIC(net, ignore);
 			return NF_ACCEPT;
@@ -1364,7 +1367,7 @@ repeat:
 		/* Invalid: inverse of the return code tells
 		 * the netfilter core what to do */
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
-		nf_conntrack_put(skb->nfct);
+		nf_conntrack_put(&ct->ct_general);
 		skb->nfct = NULL;
 		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		if (ret == -NF_DROP)
-- 
cgit v1.2.3


From cb9c68363efb6d1f950ec55fb06e031ee70db5fc Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:56 +0100
Subject: skbuff: add and use skb_nfct helper

Followup patch renames skb->nfct and changes its type so add a helper to
avoid intrusive rename change later.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h                         | 13 ++++++++++---
 include/net/netfilter/nf_conntrack_core.h      |  2 +-
 net/core/skbuff.c                              |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c              |  8 ++++----
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c            |  4 ++--
 net/ipv4/netfilter/nf_dup_ipv4.c               |  2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c             |  8 ++++----
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  4 ++--
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |  4 ++--
 net/netfilter/nf_conntrack_core.c              |  4 ++--
 net/netfilter/nf_nat_helper.c                  |  2 +-
 net/netfilter/xt_CT.c                          |  2 +-
 net/openvswitch/conntrack.c                    |  6 +++---
 net/sched/cls_flow.c                           |  2 +-
 15 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..276431e047af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3553,6 +3553,15 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 	skb->csum = csum_add(skb->csum, delta);
 }
 
+static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	return skb->nfct;
+#else
+	return NULL;
+#endif
+}
+
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 void nf_conntrack_destroy(struct nf_conntrack *nfct);
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
@@ -3652,9 +3661,7 @@ static inline bool skb_irq_freeable(const struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_XFRM)
 		!skb->sp &&
 #endif
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-		!skb->nfct &&
-#endif
+		!skb_nfct(skb) &&
 		!skb->_skb_refdst &&
 		!skb_has_frag_list(skb);
 }
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 62e17d1319ff..84ec7ca5f195 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -62,7 +62,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb);
 /* Confirm a connection: returns NF_DROP if packet must be dropped. */
 static inline int nf_conntrack_confirm(struct sk_buff *skb)
 {
-	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+	struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb);
 	int ret = NF_ACCEPT;
 
 	if (ct && !nf_ct_is_untracked(ct)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..cac3ebfb4b45 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -655,7 +655,7 @@ static void skb_release_head_state(struct sk_buff *skb)
 		skb->destructor(skb);
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
+	nf_conntrack_put(skb_nfct(skb));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 30c0de53e254..a12d4f0aa674 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -107,8 +107,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -230,8 +230,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 566afac98a88..478a025909fc 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -137,7 +137,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 
 	/* Are they talking about one of our connections? */
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 49bd6a54404f..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 #if !IS_ENABLED(CONFIG_NF_NAT)
 	/* Previously seen (loopback)?  Ignore.  Do this before
 	   fragment check. */
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 #endif
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index a981ef7151ca..1a5e1f53ceaa 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -71,7 +71,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 	nf_reset(skb);
 	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	/*
 	 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 98c8dd38575a..2dc01d2c6ec0 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -121,8 +121,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -244,8 +244,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 44b9af3f813e..09f1661a4e88 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -153,7 +153,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
@@ -224,7 +224,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	    noct_valid_new[type]) {
 		skb->nfct = &nf_ct_untracked_get()->ct_general;
 		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
+		nf_conntrack_get(skb_nfct(skb));
 		return NF_ACCEPT;
 	}
 
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 8e0bdd058787..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -37,7 +37,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -61,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Previously seen (loopback)?	*/
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index adb7af3a4c4c..78aebf0ee6e3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1357,7 +1357,7 @@ repeat:
 		goto out;
 	}
 
-	NF_CT_ASSERT(skb->nfct);
+	NF_CT_ASSERT(skb_nfct(skb));
 
 	/* Decide what timeout policy we want to apply to this flow. */
 	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1528,7 +1528,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 	/* Attach to new skbuff, and increment count */
 	nskb->nfct = &ct->ct_general;
 	nskb->nfctinfo = ctinfo;
-	nf_conntrack_get(nskb->nfct);
+	nf_conntrack_get(skb_nfct(nskb));
 }
 
 /* Bring out ya dead! */
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
 		__skb_trim(skb, skb->len + rep_len - match_len);
 	}
 
-	if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+	if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
 		/* fix IP hdr checksum information */
 		ip_hdr(skb)->tot_len = htons(skb->len);
 		ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 26b0bccfa0c5..cd7e29910ae1 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -415,7 +415,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	skb->nfct = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 
 	return XT_CONTINUE;
 }
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6b78bab27755..452557946147 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -721,8 +721,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 
 		/* Associate skb with specified zone. */
 		if (tmpl) {
-			if (skb->nfct)
-				nf_conntrack_put(skb->nfct);
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
 			nf_conntrack_get(&tmpl->ct_general);
 			skb->nfct = &tmpl->ct_general;
 			skb->nfctinfo = IP_CT_NEW;
@@ -819,7 +819,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err)
 			return err;
 
-		ct = (struct nf_conn *)skb->nfct;
+		ct = (struct nf_conn *)skb_nfct(skb);
 		if (ct)
 			nf_ct_deliver_cached_events(ct);
 	}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
 static u32 flow_get_nfct(const struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	return addr_fold(skb->nfct);
+	return addr_fold(skb_nfct(skb));
 #else
 	return 0;
 #endif
-- 
cgit v1.2.3


From c74454fadd5ea6fc866ffe2c417a0dba56b2bf1c Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:57 +0100
Subject: netfilter: add and use nf_ct_set helper

Add a helper to assign a nf_conn entry and the ctinfo bits to an sk_buff.
This avoids changing code in followup patch that merges skb->nfct and
skb->nfctinfo into skb->_nfct.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h                            |  3 +--
 include/net/netfilter/nf_conntrack.h           |  8 ++++++++
 net/ipv4/netfilter/ipt_SYNPROXY.c              |  3 +--
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  3 +--
 net/ipv4/netfilter/nf_dup_ipv4.c               |  3 +--
 net/ipv6/netfilter/ip6t_SYNPROXY.c             |  3 +--
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  6 ++----
 net/ipv6/netfilter/nf_dup_ipv6.c               |  3 +--
 net/netfilter/nf_conntrack_core.c              | 11 +++--------
 net/netfilter/nft_ct.c                         |  3 +--
 net/netfilter/xt_CT.c                          |  6 ++----
 net/openvswitch/conntrack.c                    |  6 ++----
 12 files changed, 24 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 2a344ebd7ebe..4b46c591b542 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1559,8 +1559,7 @@ static inline void ip_vs_notrack(struct sk_buff *skb)
 		nf_conntrack_put(&ct->ct_general);
 		untracked = nf_ct_untracked_get();
 		nf_conntrack_get(&untracked->ct_general);
-		skb->nfct = &untracked->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
+		nf_ct_set(skb, untracked, IP_CT_NEW);
 	}
 #endif
 }
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 5916aa9ab3f0..d704aed11684 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -34,6 +34,7 @@ union nf_conntrack_proto {
 	struct ip_ct_sctp sctp;
 	struct ip_ct_tcp tcp;
 	struct nf_ct_gre gre;
+	unsigned int tmpl_padto;
 };
 
 union nf_conntrack_expect_proto {
@@ -341,6 +342,13 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 				 gfp_t flags);
 void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
+static inline void
+nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
+{
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = info;
+}
+
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index a12d4f0aa674..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
 		goto free_nskb;
 
 	if (nfct) {
-		nskb->nfct = nfct;
-		nskb->nfctinfo = ctinfo;
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
 		nf_conntrack_get(nfct);
 	}
 
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 478a025909fc..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -172,8 +172,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 		ctinfo += IP_CT_IS_REPLY;
 
 	/* Update skb to refer to this connection */
-	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = ctinfo;
+	nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
 	return NF_ACCEPT;
 }
 
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 1a5e1f53ceaa..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -69,8 +69,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Avoid counting cloned packets towards the original connection. */
 	nf_reset(skb);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
 	nf_conntrack_get(skb_nfct(skb));
 #endif
 	/*
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 2dc01d2c6ec0..4ef1ddd4bbbd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net,
 	skb_dst_set(nskb, dst);
 
 	if (nfct) {
-		nskb->nfct = nfct;
-		nskb->nfctinfo = ctinfo;
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
 		nf_conntrack_get(nfct);
 	}
 
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 09f1661a4e88..d2c2ccbfbe72 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -189,8 +189,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* Update skb to refer to this connection */
-	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = ctinfo;
+	nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
 	return NF_ACCEPT;
 }
 
@@ -222,8 +221,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	type = icmp6h->icmp6_type - 130;
 	if (type >= 0 && type < sizeof(noct_valid_new) &&
 	    noct_valid_new[type]) {
-		skb->nfct = &nf_ct_untracked_get()->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
+		nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
 		nf_conntrack_get(skb_nfct(skb));
 		return NF_ACCEPT;
 	}
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 5f52e5f90e7e..ff04f6a7f45b 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -58,8 +58,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_reset(skb);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
 	nf_conntrack_get(skb->nfct);
 #endif
 	if (hooknum == NF_INET_PRE_ROUTING ||
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 78aebf0ee6e3..c9bd10747864 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -691,10 +691,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 
 		nf_ct_acct_merge(ct, ctinfo, loser_ct);
 		nf_conntrack_put(&loser_ct->ct_general);
-		/* Assign conntrack already in hashes to this skbuff. Don't
-		 * modify skb->nfctinfo to ensure consistent stateful filtering.
-		 */
-		skb->nfct = &ct->ct_general;
+		nf_ct_set(skb, ct, oldinfo);
 		return NF_ACCEPT;
 	}
 	NF_CT_STAT_INC(net, drop);
@@ -1282,8 +1279,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		}
 		*set_reply = 0;
 	}
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = *ctinfo;
+	nf_ct_set(skb, ct, *ctinfo);
 	return ct;
 }
 
@@ -1526,8 +1522,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 		ctinfo = IP_CT_RELATED;
 
 	/* Attach to new skbuff, and increment count */
-	nskb->nfct = &ct->ct_general;
-	nskb->nfctinfo = ctinfo;
+	nf_ct_set(nskb, ct, ctinfo);
 	nf_conntrack_get(skb_nfct(nskb));
 }
 
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d774d7823688..66a2377510e1 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -554,8 +554,7 @@ static void nft_notrack_eval(const struct nft_expr *expr,
 
 	ct = nf_ct_untracked_get();
 	atomic_inc(&ct->ct_general.use);
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, ct, IP_CT_NEW);
 }
 
 static struct nft_expr_type nft_notrack_type;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index cd7e29910ae1..51f00e1e1208 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -30,8 +30,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
 	if (!ct)
 		ct = nf_ct_untracked_get();
 	atomic_inc(&ct->ct_general.use);
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, ct, IP_CT_NEW);
 
 	return XT_CONTINUE;
 }
@@ -413,8 +412,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (skb->nfct != NULL)
 		return XT_CONTINUE;
 
-	skb->nfct = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
 	nf_conntrack_get(skb_nfct(skb));
 
 	return XT_CONTINUE;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 452557946147..d1fbfcaa009a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -460,8 +460,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = ovs_ct_get_info(h);
+	nf_ct_set(skb, ct, ovs_ct_get_info(h));
 	return ct;
 }
 
@@ -724,8 +723,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			if (skb_nfct(skb))
 				nf_conntrack_put(skb_nfct(skb));
 			nf_conntrack_get(&tmpl->ct_general);
-			skb->nfct = &tmpl->ct_general;
-			skb->nfctinfo = IP_CT_NEW;
+			nf_ct_set(skb, tmpl, IP_CT_NEW);
 		}
 
 		err = nf_conntrack_in(net, info->family,
-- 
cgit v1.2.3


From 303223092081963513494b4377fa1ac9e362ed4b Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:58 +0100
Subject: netfilter: guarantee 8 byte minalign for template addresses

The next change will merge skb->nfct pointer and skb->nfctinfo
status bits into single skb->_nfct (unsigned long) area.

For this to work nf_conn addresses must always be aligned at least on
an 8 byte boundary since we will need the lower 3bits to store nfctinfo.

Conntrack templates are allocated via kmalloc.
kbuild test robot reported
BUILD_BUG_ON failed: NFCT_INFOMASK >= ARCH_KMALLOC_MINALIGN
on v1 of this patchset, so not all platforms meet this requirement.

Do manual alignment if needed,  the alignment offset is stored in the
nf_conn entry protocol area. This works because templates are not
handed off to L4 protocol trackers.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  2 ++
 net/netfilter/nf_conntrack_core.c    | 29 ++++++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d704aed11684..06d3d2d24fe0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -163,6 +163,8 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
 int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			     const struct nf_conn *ignored_conntrack);
 
+#define NFCT_INFOMASK	7UL
+
 /* Return conntrack_info and tuple hash for given skb. */
 static inline struct nf_conn *
 nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c9bd10747864..768968fba7f6 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -350,16 +350,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 	spin_unlock(&pcpu->lock);
 }
 
+#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
+
 /* Released via destroy_conntrack() */
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 				 const struct nf_conntrack_zone *zone,
 				 gfp_t flags)
 {
-	struct nf_conn *tmpl;
+	struct nf_conn *tmpl, *p;
 
-	tmpl = kzalloc(sizeof(*tmpl), flags);
-	if (tmpl == NULL)
-		return NULL;
+	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
+		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
+		if (!tmpl)
+			return NULL;
+
+		p = tmpl;
+		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+		if (tmpl != p) {
+			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
+		}
+	} else {
+		tmpl = kzalloc(sizeof(*tmpl), flags);
+		if (!tmpl)
+			return NULL;
+	}
 
 	tmpl->status = IPS_TEMPLATE;
 	write_pnet(&tmpl->ct_net, net);
@@ -374,7 +389,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
 	nf_ct_ext_destroy(tmpl);
 	nf_ct_ext_free(tmpl);
-	kfree(tmpl);
+
+	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
+		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
+	else
+		kfree(tmpl);
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
-- 
cgit v1.2.3


From a9e419dc7be6997409dca6d1b9daf3cc7046902f Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 23 Jan 2017 18:21:59 +0100
Subject: netfilter: merge ctinfo into nfct pointer storage area

After this change conntrack operations (lookup, creation, matching from
ruleset) only access one instead of two sk_buff cache lines.

This works for normal conntracks because those are allocated from a slab
that guarantees hw cacheline or 8byte alignment (whatever is larger)
so the 3 bits needed for ctinfo won't overlap with nf_conn addresses.

Template allocation now does manual address alignment (see previous change)
on arches that don't have sufficent kmalloc min alignment.

Some spots intentionally use skb->_nfct instead of skb_nfct() helpers,
this is to avoid undoing the skb_nfct() use when we remove untracked
conntrack object in the future.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h                  | 21 +++++++++------------
 include/net/netfilter/nf_conntrack.h    | 11 ++++++-----
 net/ipv6/netfilter/nf_dup_ipv6.c        |  2 +-
 net/netfilter/core.c                    |  2 +-
 net/netfilter/nf_conntrack_core.c       | 11 ++++++-----
 net/netfilter/nf_conntrack_standalone.c |  3 +++
 net/netfilter/xt_CT.c                   |  4 ++--
 7 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 276431e047af..ac0bc085b139 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -585,7 +585,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@ip_summed: Driver fed us an IP checksum
  *	@nohdr: Payload reference only, must not modify header
- *	@nfctinfo: Relationship of this skb to the connection
  *	@pkt_type: Packet class
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
@@ -594,7 +593,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@nf_trace: netfilter packet trace flag
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
- *	@nfct: Associated connection, if any
+ *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
@@ -668,7 +667,7 @@ struct sk_buff {
 	struct	sec_path	*sp;
 #endif
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	struct nf_conntrack	*nfct;
+	unsigned long		 _nfct;
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	struct nf_bridge_info	*nf_bridge;
@@ -721,7 +720,6 @@ struct sk_buff {
 	__u8			pkt_type:3;
 	__u8			pfmemalloc:1;
 	__u8			ignore_df:1;
-	__u8			nfctinfo:3;
 
 	__u8			nf_trace:1;
 	__u8			ip_summed:2;
@@ -836,6 +834,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb)
 #define SKB_DST_NOREF	1UL
 #define SKB_DST_PTRMASK	~(SKB_DST_NOREF)
 
+#define SKB_NFCT_PTRMASK	~(7UL)
 /**
  * skb_dst - returns skb dst_entry
  * @skb: buffer
@@ -3556,7 +3555,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	return skb->nfct;
+	return (void *)(skb->_nfct & SKB_NFCT_PTRMASK);
 #else
 	return NULL;
 #endif
@@ -3590,8 +3589,8 @@ static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
 static inline void nf_reset(struct sk_buff *skb)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	nf_conntrack_put(skb->nfct);
-	skb->nfct = NULL;
+	nf_conntrack_put(skb_nfct(skb));
+	skb->_nfct = 0;
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
@@ -3611,10 +3610,8 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 			     bool copy)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	dst->nfct = src->nfct;
-	nf_conntrack_get(src->nfct);
-	if (copy)
-		dst->nfctinfo = src->nfctinfo;
+	dst->_nfct = src->_nfct;
+	nf_conntrack_get(skb_nfct(src));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	dst->nf_bridge  = src->nf_bridge;
@@ -3629,7 +3626,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	nf_conntrack_put(dst->nfct);
+	nf_conntrack_put(skb_nfct(dst));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(dst->nf_bridge);
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 06d3d2d24fe0..f540f9ad2af4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -76,7 +76,7 @@ struct nf_conn {
 	/* Usage count in here is 1 for hash table, 1 per skb,
 	 * plus 1 for any connection(s) we are `master' for
 	 *
-	 * Hint, SKB address this struct and refcnt via skb->nfct and
+	 * Hint, SKB address this struct and refcnt via skb->_nfct and
 	 * helpers nf_conntrack_get() and nf_conntrack_put().
 	 * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
 	 * beware nf_ct_get() is different and don't inc refcnt.
@@ -164,13 +164,15 @@ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			     const struct nf_conn *ignored_conntrack);
 
 #define NFCT_INFOMASK	7UL
+#define NFCT_PTRMASK	~(NFCT_INFOMASK)
 
 /* Return conntrack_info and tuple hash for given skb. */
 static inline struct nf_conn *
 nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
 {
-	*ctinfo = skb->nfctinfo;
-	return (struct nf_conn *)skb->nfct;
+	*ctinfo = skb->_nfct & NFCT_INFOMASK;
+
+	return (struct nf_conn *)(skb->_nfct & NFCT_PTRMASK);
 }
 
 /* decrement reference count on a conntrack */
@@ -347,8 +349,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl);
 static inline void
 nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
 {
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = info;
+	skb->_nfct = (unsigned long)ct | info;
 }
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index ff04f6a7f45b..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -59,7 +59,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_reset(skb);
 	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	if (hooknum == NF_INET_PRE_ROUTING ||
 	    hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ce6adfae521a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -375,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 {
 	void (*attach)(struct sk_buff *, const struct sk_buff *);
 
-	if (skb->nfct) {
+	if (skb->_nfct) {
 		rcu_read_lock();
 		attach = rcu_dereference(ip_ct_attach);
 		if (attach)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 768968fba7f6..47c4ea53daa6 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1239,7 +1239,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
-/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
 static inline struct nf_conn *
 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		  struct sk_buff *skb,
@@ -1323,7 +1323,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			NF_CT_STAT_INC_ATOMIC(net, ignore);
 			return NF_ACCEPT;
 		}
-		skb->nfct = NULL;
+		skb->_nfct = 0;
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
@@ -1352,7 +1352,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			goto out;
 		}
 		/* ICMP[v6] protocol trackers may assign one conntrack. */
-		if (skb->nfct)
+		if (skb->_nfct)
 			goto out;
 	}
 repeat:
@@ -1383,7 +1383,7 @@ repeat:
 		 * the netfilter core what to do */
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
 		nf_conntrack_put(&ct->ct_general);
-		skb->nfct = NULL;
+		skb->_nfct = 0;
 		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		if (ret == -NF_DROP)
 			NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1878,7 +1878,8 @@ int nf_conntrack_init_start(void)
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
-						sizeof(struct nf_conn), 0,
+						sizeof(struct nf_conn),
+						NFCT_INFOMASK + 1,
 						SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
 	if (!nf_conntrack_cachep)
 		goto err_cachep;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index d009ae663453..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -642,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
 	if (ret < 0)
 		goto out_start;
 
+	BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
+	BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
+
 #ifdef CONFIG_SYSCTL
 	nf_ct_netfilter_header =
 		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 51f00e1e1208..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,7 +23,7 @@
 static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
 	/* special case the untracked ct : we want the percpu object */
@@ -409,7 +409,7 @@ static unsigned int
 notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
 	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
-- 
cgit v1.2.3


From 90c1aff702d449a1a248c4829d51c0bc677f968e Mon Sep 17 00:00:00 2001
From: David Windsor
Date: Mon, 23 Jan 2017 22:24:29 -0500
Subject: ipvs: free ip_vs_dest structs when refcnt=0

Currently, the ip_vs_dest cache frees ip_vs_dest objects when their
reference count becomes < 0.  Aside from not being semantically sound,
this is problematic for the new type refcount_t, which will be introduced
shortly in a separate patch. refcount_t is the new kernel type for
holding reference counts, and provides overflow protection and a
constrained interface relative to atomic_t (the type currently being
used for kernel reference counts).

Per Julian Anastasov: "The problem is that dest_trash currently holds
deleted dests (unlinked from RCU lists) with refcnt=0."  Changing
dest_trash to hold dest with refcnt=1 will allow us to free ip_vs_dest
structs when their refcnt=0, in ip_vs_dest_put_and_free().

Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h            | 2 +-
 net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4b46c591b542..7bdfa7d78363 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1421,7 +1421,7 @@ static inline void ip_vs_dest_put(struct ip_vs_dest *dest)
 
 static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
 {
-	if (atomic_dec_return(&dest->refcnt) < 0)
+	if (atomic_dec_and_test(&dest->refcnt))
 		kfree(dest);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 55e0169caa4c..5fc4836e7c79 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -711,7 +711,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
 		      dest->vport == svc->port))) {
 			/* HIT */
 			list_del(&dest->t_list);
-			ip_vs_dest_hold(dest);
 			goto out;
 		}
 	}
@@ -741,7 +740,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
  *  When the ip_vs_control_clearup is activated by ipvs module exit,
  *  the service tables must have been flushed and all the connections
  *  are expired, and the refcnt of each destination in the trash must
- *  be 0, so we simply release them here.
+ *  be 1, so we simply release them here.
  */
 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
 {
@@ -1080,11 +1079,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
 	if (list_empty(&ipvs->dest_trash) && !cleanup)
 		mod_timer(&ipvs->dest_trash_timer,
 			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
-	/* dest lives in trash without reference */
+	/* dest lives in trash with reference */
 	list_add(&dest->t_list, &ipvs->dest_trash);
 	dest->idle_start = 0;
 	spin_unlock_bh(&ipvs->dest_trash_lock);
-	ip_vs_dest_put(dest);
 }
 
 
@@ -1160,7 +1158,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
 
 	spin_lock(&ipvs->dest_trash_lock);
 	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
-		if (atomic_read(&dest->refcnt) > 0)
+		if (atomic_read(&dest->refcnt) > 1)
 			continue;
 		if (dest->idle_start) {
 			if (time_before(now, dest->idle_start +
-- 
cgit v1.2.3


From 2851940ffee313e0ff12540a8e11a8c54dea9c65 Mon Sep 17 00:00:00 2001
From: Michal Kubeček
Date: Tue, 31 Jan 2017 10:30:06 +0100
Subject: netfilter: allow logging from non-init namespaces

Commit 69b34fb996b2 ("netfilter: xt_LOG: add net namespace support for
xt_LOG") disabled logging packets using the LOG target from non-init
namespaces. The motivation was to prevent containers from flooding
kernel log of the host. The plan was to keep it that way until syslog
namespace implementation allows containers to log in a safe way.

However, the work on syslog namespace seems to have hit a dead end
somewhere in 2013 and there are users who want to use xt_LOG in all
network namespaces. This patch allows to do so by setting

  /proc/sys/net/netfilter/nf_log_all_netns

to a nonzero value. This sysctl is only accessible from init_net so that
one cannot switch the behaviour from inside a container.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/netfilter-sysctl.txt | 10 ++++++++++
 include/net/netfilter/nf_log.h                |  3 +++
 net/bridge/netfilter/ebt_log.c                |  2 +-
 net/ipv4/netfilter/nf_log_arp.c               |  2 +-
 net/ipv4/netfilter/nf_log_ipv4.c              |  2 +-
 net/ipv6/netfilter/nf_log_ipv6.c              |  2 +-
 net/netfilter/nf_log.c                        | 24 ++++++++++++++++++++++++
 7 files changed, 41 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/networking/netfilter-sysctl.txt

(limited to 'net')

diff --git a/Documentation/networking/netfilter-sysctl.txt b/Documentation/networking/netfilter-sysctl.txt
new file mode 100644
index 000000000000..55791e50e169
--- /dev/null
+++ b/Documentation/networking/netfilter-sysctl.txt
@@ -0,0 +1,10 @@
+/proc/sys/net/netfilter/* Variables:
+
+nf_log_all_netns - BOOLEAN
+	0 - disabled (default)
+	not 0 - enabled
+
+	By default, only init_net namespace can log packets into kernel log
+	with LOG target; this aims to prevent containers from flooding host
+	kernel log. If enabled, this target also works in other network
+	namespaces. This variable is only accessible from init_net.
diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 450f87f95415..42e0696f38d8 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -51,6 +51,9 @@ struct nf_logger {
 	struct module		*me;
 };
 
+/* sysctl_nf_log_all_netns - allow LOG target in all network namespaces */
+extern int sysctl_nf_log_all_netns;
+
 /* Function to register/unregister log function. */
 int nf_log_register(u_int8_t pf, struct nf_logger *logger);
 void nf_log_unregister(struct nf_logger *logger);
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e88bd4827ac1..98b9c8e8615e 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	unsigned int bitmask;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	spin_lock_bh(&ebt_log_lock);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..f6f713376e6e 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 57d86066a13b..055c51b80f5d 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3dca90dc24ad..0a034f52b912 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,6 +16,9 @@
 #define NF_LOG_PREFIXLEN		128
 #define NFLOGGER_NAME_LEN		64
 
+int sysctl_nf_log_all_netns __read_mostly;
+EXPORT_SYMBOL(sysctl_nf_log_all_netns);
+
 static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
@@ -414,6 +417,18 @@ static const struct file_operations nflog_file_ops = {
 #ifdef CONFIG_SYSCTL
 static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table_header *nf_log_sysctl_fhdr;
+
+static struct ctl_table nf_log_sysctl_ftable[] = {
+	{
+		.procname	= "nf_log_all_netns",
+		.data		= &sysctl_nf_log_all_netns,
+		.maxlen		= sizeof(sysctl_nf_log_all_netns),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
 
 static int nf_log_proc_dostring(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -483,6 +498,10 @@ static int netfilter_log_sysctl_init(struct net *net)
 			nf_log_sysctl_table[i].extra1 =
 				(void *)(unsigned long) i;
 		}
+		nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter",
+							 nf_log_sysctl_ftable);
+		if (!nf_log_sysctl_fhdr)
+			goto err_freg;
 	}
 
 	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
@@ -499,6 +518,9 @@ static int netfilter_log_sysctl_init(struct net *net)
 err_reg:
 	if (!net_eq(net, &init_net))
 		kfree(table);
+	else
+		unregister_net_sysctl_table(nf_log_sysctl_fhdr);
+err_freg:
 err_alloc:
 	return -ENOMEM;
 }
@@ -511,6 +533,8 @@ static void netfilter_log_sysctl_exit(struct net *net)
 	unregister_net_sysctl_table(net->nf.nf_log_dir_header);
 	if (!net_eq(net, &init_net))
 		kfree(table);
+	else
+		unregister_net_sysctl_table(nf_log_sysctl_fhdr);
 }
 #else
 static int netfilter_log_sysctl_init(struct net *net)
-- 
cgit v1.2.3


From ba94f3088b792b16ea576a256a6030feddc87f24 Mon Sep 17 00:00:00 2001
From: Andrey Vagin
Date: Wed, 1 Feb 2017 11:00:45 -0800
Subject: unix: add ioctl to open a unix socket file with O_PATH

This ioctl opens a file to which a socket is bound and
returns a file descriptor. The caller has to have CAP_NET_ADMIN
in the socket network namespace.

Currently it is impossible to get a path and a mount point
for a socket file. socket_diag reports address, device ID and inode
number for unix sockets. An address can contain a relative path or
a file may be moved somewhere. And these properties say nothing about
a mount namespace and a mount point of a socket file.

With the introduced ioctl, we can get a path by reading
/proc/self/fd/X and get mnt_id from /proc/self/fdinfo/X.

In CRIU we are going to use this ioctl to dump and restore unix socket.

Here is an example how it can be used:

$ strace -e socket,bind,ioctl ./test /tmp/test_sock
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
bind(3, {sa_family=AF_UNIX, sun_path="test_sock"}, 11) = 0
ioctl(3, SIOCUNIXFILE, 0)           = 4
^Z

$ ss -a | grep test_sock
u_str  LISTEN     0      1      test_sock 17798                 * 0

$ ls -l /proc/760/fd/{3,4}
lrwx------ 1 root root 64 Feb  1 09:41 3 -> 'socket:[17798]'
l--------- 1 root root 64 Feb  1 09:41 4 -> /tmp/test_sock

$ cat /proc/760/fdinfo/4
pos:	0
flags:	012000000
mnt_id:	40

$ cat /proc/self/mountinfo | grep "^40\s"
40 19 0:37 / /tmp rw shared:23 - tmpfs tmpfs rw

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/un.h |  2 ++
 net/unix/af_unix.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/un.h b/include/uapi/linux/un.h
index 3ed3e46c1b1f..4f0ab3a548ad 100644
--- a/include/uapi/linux/un.h
+++ b/include/uapi/linux/un.h
@@ -10,4 +10,6 @@ struct sockaddr_un {
 	char sun_path[UNIX_PATH_MAX];	/* pathname */
 };
 
+#define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0) /* open a socket file with O_PATH */
+
 #endif /* _LINUX_UN_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cef79873b09d..e2d18b9f910f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,6 +117,7 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 #include <linux/freezer.h>
+#include <linux/file.h>
 
 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
@@ -2592,6 +2593,43 @@ long unix_outq_len(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(unix_outq_len);
 
+static int unix_open_file(struct sock *sk)
+{
+	struct path path;
+	struct file *f;
+	int fd;
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	unix_state_lock(sk);
+	path = unix_sk(sk)->path;
+	if (!path.dentry) {
+		unix_state_unlock(sk);
+		return -ENOENT;
+	}
+
+	path_get(&path);
+	unix_state_unlock(sk);
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		goto out;
+
+	f = dentry_open(&path, O_PATH, current_cred());
+	if (IS_ERR(f)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(f);
+		goto out;
+	}
+
+	fd_install(fd, f);
+out:
+	path_put(&path);
+
+	return fd;
+}
+
 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
@@ -2610,6 +2648,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		else
 			err = put_user(amount, (int __user *)arg);
 		break;
+	case SIOCUNIXFILE:
+		err = unix_open_file(sk);
+		break;
 	default:
 		err = -ENOIOCTLCMD;
 		break;
-- 
cgit v1.2.3


From b9ea2a7be758f2c07a0c044645edc409a8a388fd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 1 Feb 2017 18:13:23 -0800
Subject: net: remove useless pfmemalloc setting

When __alloc_skb() allocates an skb from fast clone cache,
setting pfmemalloc on the clone is not needed.

Clone will be properly initialized later at skb_clone() time,
including pfmemalloc field, as it is included in the
headers_start/headers_end section which is fully copied.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 26c1344cc23e..4f8f2a1a66b5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(&fclones->fclone_ref, 1);
 
 		fclones->skb2.fclone = SKB_FCLONE_CLONE;
-		fclones->skb2.pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
-- 
cgit v1.2.3


From 661091093918657ab544fb8ca91a5ab172a986dc Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 1 Feb 2017 18:41:25 -0800
Subject: net: ipv4: remove fib_lookup.h from devinet.c include list

nothing in devinet.c relies on fib_lookup.h; remove it from the includes

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 4cd2ee8857d2..5d367b7ff542 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -65,8 +65,6 @@
 #include <net/net_namespace.h>
 #include <net/addrconf.h>
 
-#include "fib_lookup.h"
-
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
 		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
-- 
cgit v1.2.3


From 8fe809a992639b2013c0d8da2ba55cdea28a959a Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 1 Feb 2017 20:47:59 -0800
Subject: net: add LINUX_MIB_PFMEMALLOCDROP counter

Debugging issues caused by pfmemalloc is often tedious.

Add a new SNMP counter to more easily diagnose these problems.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Josef Bacik <jbacik@fb.com>
Acked-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/core/filter.c         | 5 +++--
 net/ipv4/proc.c           | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index e7a31f830690..3b2bed7ca9a4 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -240,6 +240,7 @@ enum
 	LINUX_MIB_SACKMERGED,
 	LINUX_MIB_SACKSHIFTFALLBACK,
 	LINUX_MIB_TCPBACKLOGDROP,
+	LINUX_MIB_PFMEMALLOCDROP,
 	LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */
 	LINUX_MIB_TCPDEFERACCEPTDROP,
 	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
diff --git a/net/core/filter.c b/net/core/filter.c
index 1e00737e3bc3..0b753cbb2536 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -76,9 +76,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 	 * allow SOCK_MEMALLOC sockets to use it as this socket is
 	 * helping free memory
 	 */
-	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
 		return -ENOMEM;
-
+	}
 	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
 	if (err)
 		return err;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a9deeb90dd36..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
 	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
 	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+	SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
-- 
cgit v1.2.3


From 3541f9e8bdebce02458882b66b638d7302c1f616 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Feb 2017 08:04:56 -0800
Subject: tcp: add tcp_mss_clamp() helper

Small cleanup factorizing code doing the TCP_MAXSEG clamping.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  9 +++++++++
 net/ipv4/tcp_ipv4.c      |  5 +----
 net/ipv4/tcp_minisocks.c |  7 ++-----
 net/ipv4/tcp_output.c    | 14 ++++----------
 net/ipv6/tcp_ipv6.c      |  5 +----
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f88f4649ba6f..cfc2d9506ce8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -445,4 +445,13 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 
 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
 
+static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
+{
+	/* We use READ_ONCE() here because socket might not be locked.
+	 * This happens for listeners.
+	 */
+	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
+
+	return (user_mss && user_mss < mss) ? user_mss : mss;
+}
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8c9e9aa17d66..8c124d4ef4b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1324,10 +1324,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bdb443471c39..dff7d2aaf861 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -360,15 +360,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk_listener);
-	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
 	int full_space = tcp_full_space(sk_listener);
-	int mss = dst_metric_advmss(dst);
 	u32 window_clamp;
 	__u8 rcv_wscale;
+	int mss;
 
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
-
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 	window_clamp = READ_ONCE(tp->window_clamp);
 	/* Set this up on the first call only */
 	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d5bab8a3ea6..956bea9a5394 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3062,7 +3062,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
-	u16 user_mss;
 	int mss;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3092,10 +3091,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	}
 	skb_dst_set(skb, dst);
 
-	mss = dst_metric_advmss(dst);
-	user_mss = READ_ONCE(tp->rx_opt.user_mss);
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3201,9 +3197,7 @@ static void tcp_connect_init(struct sock *sk)
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
-	tp->advmss = dst_metric_advmss(dst);
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
-		tp->advmss = tp->rx_opt.user_mss;
+	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
 
@@ -3280,8 +3274,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	 * user-MSS. Reserve maximum option space for middleboxes that add
 	 * private TCP options. The cost is reduced data space in SYN :(
 	 */
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
-		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
 	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
 		MAX_TCP_OPTION_SPACE;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 64834ec5ab73..6b9fc63fd4d2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1147,10 +1147,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
-- 
cgit v1.2.3


From 1d5e7c859e81a66674d194c346119d154d31e9dc Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Wed, 1 Feb 2017 15:30:01 +0200
Subject: net/sched: act_ife: Unexport ife_tlv_meta_encode

As the function ife_tlv_meta_encode is not used by any other module,
unexport it and make it static for the act_ife module.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h | 2 --
 net/sched/act_ife.c         | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index 9fd2bea0a6e0..f37e7516ab28 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -45,8 +45,6 @@ struct tcf_meta_ops {
 
 int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
 int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
-			const void *dval);
 int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp);
 int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp);
 int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 921fb20eaa7c..70148c10ede9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -48,7 +48,8 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
 
 /* Caller takes care of presenting data in network order
 */
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+static int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			       const void *dval)
 {
 	u32 *tlv = (u32 *)(skbdata);
 	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
@@ -61,7 +62,6 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
 
 	return totlen;
 }
-EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
 
 int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
 {
-- 
cgit v1.2.3


From 1ce8460496c05379c66edc178c3c55ca4e953044 Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Wed, 1 Feb 2017 15:30:02 +0200
Subject: net: Introduce ife encapsulation module

This module is responsible for the ife encapsulation protocol
encode/decode logics. That module can:
 - ife_encode: encode skb and reserve space for the ife meta header
 - ife_decode: decode skb and extract the meta header size
 - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife
   header space.
 - ife_tlv_meta_decode - decodes one tlv entry from the packet
 - ife_tlv_meta_next - advance to the next tlv

Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS               |   7 +++
 include/net/ife.h         |  51 +++++++++++++++++
 include/uapi/linux/Kbuild |   1 +
 include/uapi/linux/ife.h  |  18 ++++++
 net/Kconfig               |   1 +
 net/Makefile              |   1 +
 net/ife/Kconfig           |  16 ++++++
 net/ife/Makefile          |   5 ++
 net/ife/ife.c             | 142 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 242 insertions(+)
 create mode 100644 include/net/ife.h
 create mode 100644 include/uapi/linux/ife.h
 create mode 100644 net/ife/Kconfig
 create mode 100644 net/ife/Makefile
 create mode 100644 net/ife/ife.c

(limited to 'net')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5e637e2b3ff9..2abda6cb3150 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6250,6 +6250,13 @@ F:	include/net/cfg802154.h
 F:	include/net/ieee802154_netdev.h
 F:	Documentation/networking/ieee802154.txt
 
+IFE PROTOCOL
+M:	Yotam Gigi <yotamg@mellanox.com>
+M:	Jamal Hadi Salim <jhs@mojatatu.com>
+F:	net/ife
+F:	include/net/ife.h
+F:	include/uapi/linux/ife.h
+
 IGORPLUG-USB IR RECEIVER
 M:	Sean Young <sean@mess.org>
 L:	linux-media@vger.kernel.org
diff --git a/include/net/ife.h b/include/net/ife.h
new file mode 100644
index 000000000000..2d87d6898b0a
--- /dev/null
+++ b/include/net/ife.h
@@ -0,0 +1,51 @@
+#ifndef __NET_IFE_H
+#define __NET_IFE_H
+
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <uapi/linux/ife.h>
+
+#if IS_ENABLED(CONFIG_NET_IFE)
+
+void *ife_encode(struct sk_buff *skb, u16 metalen);
+void *ife_decode(struct sk_buff *skb, u16 *metalen);
+
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+
+void *ife_tlv_meta_next(void *skbdata);
+
+#else
+
+static inline void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+	return NULL;
+}
+
+static inline void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+	return NULL;
+}
+
+static inline void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen,
+					u16 *totlen)
+{
+	return NULL;
+}
+
+static inline int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval)
+{
+	return 0;
+}
+
+static inline void *ife_tlv_meta_next(void *skbdata)
+{
+	return NULL;
+}
+
+#endif
+
+#endif /* __NET_IFE_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 486e050e64c5..a2e90722a4c4 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -195,6 +195,7 @@ header-y += if_tun.h
 header-y += if_tunnel.h
 header-y += if_vlan.h
 header-y += if_x25.h
+header-y += ife.h
 header-y += igmp.h
 header-y += ila.h
 header-y += in6.h
diff --git a/include/uapi/linux/ife.h b/include/uapi/linux/ife.h
new file mode 100644
index 000000000000..2954da32e012
--- /dev/null
+++ b/include/uapi/linux/ife.h
@@ -0,0 +1,18 @@
+#ifndef __UAPI_IFE_H
+#define __UAPI_IFE_H
+
+#define IFE_METAHDRLEN 2
+
+enum {
+	IFE_META_SKBMARK = 1,
+	IFE_META_HASHID,
+	IFE_META_PRIO,
+	IFE_META_QMAP,
+	IFE_META_TCINDEX,
+	__IFE_META_MAX
+};
+
+/*Can be overridden at runtime by module option*/
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index ce4aee69fc0d..2f2842d2d3ed 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -391,6 +391,7 @@ source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
 source "net/psample/Kconfig"
+source "net/ife/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
diff --git a/net/Makefile b/net/Makefile
index 7d41de48310e..9b681550e3a3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
 obj-$(CONFIG_PSAMPLE)		+= psample/
+obj-$(CONFIG_NET_IFE)		+= ife/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_MPLS)		+= mpls/
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..31e48b652c7c
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
+#
+# IFE subsystem configuration
+#
+
+menuconfig NET_IFE
+	depends on NET
+        tristate "Inter-FE based on IETF ForCES InterFE LFB"
+	default n
+	help
+	  Say Y here to add support of IFE encapsulation protocol
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this support as a module, choose M here: the module will
+	  be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..2a90d97746cc
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the IFE encapsulation protocol
+#
+
+obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..f360341c72eb
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,142 @@
+/*
+ * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
+ * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+	/* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	 * where ORIGDATA = original ethernet header ...
+	 */
+	int hdrm = metalen + IFE_METAHDRLEN;
+	int total_push = hdrm + skb->dev->hard_header_len;
+	struct ifeheadr *ifehdr;
+	struct ethhdr *iethh;	/* inner ether header */
+	int skboff = 0;
+	int err;
+
+	err = skb_cow_head(skb, total_push);
+	if (unlikely(err))
+		return NULL;
+
+	iethh = (struct ethhdr *) skb->data;
+
+	__skb_push(skb, total_push);
+	memcpy(skb->data, iethh, skb->dev->hard_header_len);
+	skb_reset_mac_header(skb);
+	skboff += skb->dev->hard_header_len;
+
+	/* total metadata length */
+	ifehdr = (struct ifeheadr *) (skb->data + skboff);
+	metalen += IFE_METAHDRLEN;
+	ifehdr->metalen = htons(metalen);
+
+	return ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_encode);
+
+void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+	struct ifeheadr *ifehdr;
+	int total_pull;
+	u16 ifehdrln;
+
+	ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+	ifehdrln = ntohs(ifehdr->metalen);
+	total_pull = skb->dev->hard_header_len + ifehdrln;
+
+	if (unlikely(ifehdrln < 2))
+		return NULL;
+
+	if (unlikely(!pskb_may_pull(skb, total_pull)))
+		return NULL;
+
+	skb_set_mac_header(skb, total_pull);
+	__skb_pull(skb, total_pull);
+	*metalen = ifehdrln - IFE_METAHDRLEN;
+
+	return &ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_decode);
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+/* Caller takes care of presenting data in network order
+ */
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+
+	*dlen = ntohs(tlv->len) - NLA_HDRLEN;
+	*attrtype = ntohs(tlv->type);
+
+	if (totlen)
+		*totlen = nla_total_size(*dlen);
+
+	return skbdata + sizeof(struct meta_tlvhdr);
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
+
+void *ife_tlv_meta_next(void *skbdata)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+	u16 tlvlen = ntohs(tlv->len);
+
+	tlvlen = NLA_ALIGN(tlvlen);
+
+	return skbdata + tlvlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
+
+/* Caller takes care of presenting data in network order
+ */
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	__be32 *tlv = (__be32 *) (skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *) tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 295a6e06d21e1f469c9f38b00125a13b60ad4e7c Mon Sep 17 00:00:00 2001
From: Yotam Gigi
Date: Wed, 1 Feb 2017 15:30:03 +0200
Subject: net/sched: act_ife: Change to use ife module

Use the encode/decode functionality from the ife module instead of using
implementation inside the act_ife.

Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h        |   1 -
 include/uapi/linux/tc_act/tc_ife.h |  10 +---
 net/sched/Kconfig                  |   1 +
 net/sched/act_ife.c                | 110 +++++++++++--------------------------
 4 files changed, 34 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index f37e7516ab28..30ba459ddd34 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -6,7 +6,6 @@
 #include <linux/rtnetlink.h>
 #include <linux/module.h>
 
-#define IFE_METAHDRLEN 2
 struct tcf_ife_info {
 	struct tc_action common;
 	u8 eth_dst[ETH_ALEN];
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
index cd18360eca24..7c2817866c97 100644
--- a/include/uapi/linux/tc_act/tc_ife.h
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
+#include <linux/ife.h>
 
 #define TCA_ACT_IFE 25
 /* Flag bits for now just encoding/decoding; mutually exclusive */
@@ -28,13 +29,4 @@ enum {
 };
 #define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
 
-#define IFE_META_SKBMARK 1
-#define IFE_META_HASHID 2
-#define	IFE_META_PRIO 3
-#define	IFE_META_QMAP 4
-#define	IFE_META_TCINDEX 5
-/*Can be overridden at runtime by module option*/
-#define	__IFE_META_MAX 6
-#define IFE_META_MAX (__IFE_META_MAX - 1)
-
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 72cfa3a6bac0..403790cce7d2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -776,6 +776,7 @@ config NET_ACT_SKBMOD
 config NET_ACT_IFE
         tristate "Inter-FE action based on IETF ForCES InterFE LFB"
         depends on NET_CLS_ACT
+        select NET_IFE
         ---help---
 	  Say Y here to allow for sourcing and terminating metadata
 	  For details refer to netdev01 paper:
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 70148c10ede9..71e7ff22f7c9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,6 +32,7 @@
 #include <uapi/linux/tc_act/tc_ife.h>
 #include <net/tc_act/tc_ife.h>
 #include <linux/etherdevice.h>
+#include <net/ife.h>
 
 #define IFE_TAB_MASK 15
 
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
 	[TCA_IFE_TYPE] = { .type = NLA_U16},
 };
 
-/* Caller takes care of presenting data in network order
-*/
-static int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
-			       const void *dval)
-{
-	u32 *tlv = (u32 *)(skbdata);
-	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
-	char *dptr = (char *)tlv + NLA_HDRLEN;
-	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
-
-	*tlv = htonl(htlv);
-	memset(dptr, 0, totlen - NLA_HDRLEN);
-	memcpy(dptr, dval, dlen);
-
-	return totlen;
-}
-
 int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
 {
 	u16 edata = 0;
@@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
 	return 0;
 }
 
-struct ifeheadr {
-	__be16 metalen;
-	u8 tlv_data[];
-};
-
-struct meta_tlvhdr {
-	__be16 type;
-	__be16 len;
-};
-
 static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
-	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
-	int ifehdrln = (int)ifehdr->metalen;
-	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+	u8 *ifehdr_end;
+	u8 *tlv_data;
+	u16 metalen;
 
 	spin_lock(&ife->tcf_lock);
 	bstats_update(&ife->tcf_bstats, skb);
 	tcf_lastuse_update(&ife->tcf_tm);
 	spin_unlock(&ife->tcf_lock);
 
-	ifehdrln = ntohs(ifehdrln);
-	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+	if (skb_at_tc_ingress(skb))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	tlv_data = ife_decode(skb, &metalen);
+	if (unlikely(!tlv_data)) {
 		spin_lock(&ife->tcf_lock);
 		ife->tcf_qstats.drops++;
 		spin_unlock(&ife->tcf_lock);
 		return TC_ACT_SHOT;
 	}
 
-	skb_set_mac_header(skb, ifehdrln);
-	__skb_pull(skb, ifehdrln);
-	skb->protocol = eth_type_trans(skb, skb->dev);
-	ifehdrln -= IFE_METAHDRLEN;
-
-	while (ifehdrln > 0) {
-		u8 *tlvdata = (u8 *)tlv;
-		u16 mtype = tlv->type;
-		u16 mlen = tlv->len;
-		u16 alen;
+	ifehdr_end = tlv_data + metalen;
+	for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
+		u8 *curr_data;
+		u16 mtype;
+		u16 dlen;
 
-		mtype = ntohs(mtype);
-		mlen = ntohs(mlen);
-		alen = NLA_ALIGN(mlen);
+		curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
 
-		if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN),
-				       (void *)(tlvdata + NLA_HDRLEN))) {
+		if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
 			/* abuse overlimits to count when we receive metadata
 			 * but dont have an ops for it
 			 */
-			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
-					    mtype, mlen);
+			pr_info_ratelimited("Unknown metaid %d dlen %d\n",
+					    mtype, dlen);
 			ife->tcf_qstats.overlimits++;
 		}
+	}
 
-		tlvdata += alen;
-		ifehdrln -= alen;
-		tlv = (struct meta_tlvhdr *)tlvdata;
+	if (WARN_ON(tlv_data != ifehdr_end)) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
 	}
 
+	skb->protocol = eth_type_trans(skb, skb->dev);
 	skb_reset_network_header(skb);
+
 	return action;
 }
 
@@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ethhdr *oethh;	/* outer ether header */
-	struct ethhdr *iethh;	/* inner eth header */
 	struct tcf_meta_info *e;
 	/*
 	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,10 +708,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	u16 metalen = ife_get_sz(skb, ife);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
-	unsigned int skboff = skb->dev->hard_header_len;
+	unsigned int skboff = 0;
 	int new_len = skb->len + hdrm;
 	bool exceed_mtu = false;
-	int err;
+	void *ife_meta;
+	int err = 0;
 
 	if (!skb_at_tc_ingress(skb)) {
 		if (new_len > skb->dev->mtu)
@@ -765,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		return TC_ACT_SHOT;
 	}
 
-	err = skb_cow_head(skb, hdrm);
-	if (unlikely(err)) {
-		ife->tcf_qstats.drops++;
-		spin_unlock(&ife->tcf_lock);
-		return TC_ACT_SHOT;
-	}
-
 	if (skb_at_tc_ingress(skb))
 		skb_push(skb, skb->dev->hard_header_len);
 
-	iethh = (struct ethhdr *)skb->data;
-	__skb_push(skb, hdrm);
-	memcpy(skb->data, iethh, skb->mac_len);
-	skb_reset_mac_header(skb);
-	oethh = eth_hdr(skb);
-
-	/*total metadata length */
-	metalen += IFE_METAHDRLEN;
-	metalen = htons(metalen);
-	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
-	skboff += IFE_METAHDRLEN;
+	ife_meta = ife_encode(skb, metalen);
 
 	/* XXX: we dont have a clever way of telling encode to
 	 * not repeat some of the computations that are done by
@@ -793,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	list_for_each_entry(e, &ife->metalist, metalist) {
 		if (e->ops->encode) {
-			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+			err = e->ops->encode(skb, (void *)(ife_meta + skboff),
 					     e);
 		}
 		if (err < 0) {
@@ -804,15 +761,12 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		}
 		skboff += err;
 	}
+	oethh = (struct ethhdr *)skb->data;
 
 	if (!is_zero_ether_addr(ife->eth_src))
 		ether_addr_copy(oethh->h_source, ife->eth_src);
-	else
-		ether_addr_copy(oethh->h_source, iethh->h_source);
 	if (!is_zero_ether_addr(ife->eth_dst))
 		ether_addr_copy(oethh->h_dest, ife->eth_dst);
-	else
-		ether_addr_copy(oethh->h_dest, iethh->h_dest);
 	oethh->h_proto = htons(ife->eth_type);
 
 	if (skb_at_tc_ingress(skb))
-- 
cgit v1.2.3


From efa5356b0d9753b9d7e63e41459eba106cce30f3 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Tue, 31 Jan 2017 22:59:54 -0800
Subject: bridge: per vlan dst_metadata netlink support

This patch adds support to attach per vlan tunnel info dst
metadata. This enables bridge driver to map vlan to tunnel_info
at ingress and egress. It uses the kernel dst_metadata infrastructure.

The initial use case is vlan to vni bridging, but the api is generic
to extend to any tunnel_info in the future:
    - Uapi to configure/unconfigure/dump per vlan tunnel data
    - netlink functions to configure vlan and tunnel_info mapping
    - Introduces bridge port flag BR_LWT_VLAN to enable attach/detach
    dst_metadata to bridged packets on ports. off by default.
    - changes to existing code is mainly refactor some existing vlan
    handling netlink code + hooks for new vlan tunnel code
    - I have kept the vlan tunnel code isolated in separate files.
    - most of the netlink vlan tunnel code is handling of vlan-tunid
    ranges (follows the vlan range handling code). To conserve space
    vlan-tunid by default are always dumped in ranges if applicable.

Use case:
example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis).

iproute2 example (patched and pruned iproute2 output to just show
relevant fdb entries):
example shows same host mac learnt on two vni's and
vlan 100 maps to vni 1000, vlan 101 maps to vni 1001

before (netdev per vni):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self

after this patch with collect metdata in bridged mode (single netdev):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/Makefile            |   5 +-
 net/bridge/br_netlink.c        | 140 ++++++++++++-------
 net/bridge/br_netlink_tunnel.c | 296 +++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_private.h        |  10 ++
 net/bridge/br_private_tunnel.h |  72 ++++++++++
 net/bridge/br_vlan.c           |  17 ++-
 net/bridge/br_vlan_tunnel.c    | 149 +++++++++++++++++++++
 7 files changed, 641 insertions(+), 48 deletions(-)
 create mode 100644 net/bridge/br_netlink_tunnel.c
 create mode 100644 net/bridge/br_private_tunnel.h
 create mode 100644 net/bridge/br_vlan_tunnel.c

(limited to 'net')

diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc011b668..40b1ede527ca 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 
 bridge-y	:= br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
 			br_ioctl.o br_stp.o br_stp_bpdu.o \
-			br_stp_if.o br_stp_timer.o br_netlink.o
+			br_stp_if.o br_stp_timer.o br_netlink.o \
+			br_netlink_tunnel.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
 bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
 
 bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 1ca25498fe4d..fc5d885dbb22 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
 
 #include "br_private.h"
 #include "br_private_stp.h"
+#include "br_private_tunnel.h"
 
 static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
 				u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 					   u32 filter_mask)
 {
 	struct net_bridge_vlan_group *vg = NULL;
-	struct net_bridge_port *p;
+	struct net_bridge_port *p = NULL;
 	struct net_bridge *br;
 	int num_vlan_infos;
+	size_t vinfo_sz = 0;
 
 	rcu_read_lock();
 	if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 	num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
 	rcu_read_unlock();
 
+	if (p && (p->flags & BR_VLAN_TUNNEL))
+		vinfo_sz += br_get_vlan_tunnel_info_size(vg);
+
 	/* Each VLAN is returned in bridge_vlan_info along with flags */
-	return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+	vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+
+	return vinfo_sz;
 }
 
 static inline size_t br_port_info_size(void)
@@ -128,6 +135,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
+		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -194,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
 	    nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
 		       p->topology_change_ack) ||
-	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending))
+	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
+	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
+							BR_VLAN_TUNNEL)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -417,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
 			err = br_fill_ifvlaninfo_compressed(skb, vg);
 		else
 			err = br_fill_ifvlaninfo(skb, vg);
+
+		if (port && (port->flags & BR_VLAN_TUNNEL))
+			err = br_fill_vlan_tunnel_info(skb, vg);
 		rcu_read_unlock();
 		if (err)
 			goto nla_put_failure;
@@ -517,60 +530,91 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
 	return err;
 }
 
+static int br_process_vlan_info(struct net_bridge *br,
+				struct net_bridge_port *p, int cmd,
+				struct bridge_vlan_info *vinfo_curr,
+				struct bridge_vlan_info **vinfo_last)
+{
+	if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
+		return -EINVAL;
+
+	if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+		/* check if we are already processing a range */
+		if (*vinfo_last)
+			return -EINVAL;
+		*vinfo_last = vinfo_curr;
+		/* don't allow range of pvids */
+		if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (*vinfo_last) {
+		struct bridge_vlan_info tmp_vinfo;
+		int v, err;
+
+		if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
+			return -EINVAL;
+
+		if (vinfo_curr->vid <= (*vinfo_last)->vid)
+			return -EINVAL;
+
+		memcpy(&tmp_vinfo, *vinfo_last,
+		       sizeof(struct bridge_vlan_info));
+		for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+			tmp_vinfo.vid = v;
+			err = br_vlan_info(br, p, cmd, &tmp_vinfo);
+			if (err)
+				break;
+		}
+		*vinfo_last = NULL;
+
+		return 0;
+	}
+
+	return br_vlan_info(br, p, cmd, vinfo_curr);
+}
+
 static int br_afspec(struct net_bridge *br,
 		     struct net_bridge_port *p,
 		     struct nlattr *af_spec,
 		     int cmd)
 {
-	struct bridge_vlan_info *vinfo_start = NULL;
-	struct bridge_vlan_info *vinfo = NULL;
+	struct bridge_vlan_info *vinfo_curr = NULL;
+	struct bridge_vlan_info *vinfo_last = NULL;
 	struct nlattr *attr;
-	int err = 0;
-	int rem;
+	struct vtunnel_info tinfo_last = {};
+	struct vtunnel_info tinfo_curr = {};
+	int err = 0, rem;
 
 	nla_for_each_nested(attr, af_spec, rem) {
-		if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO)
-			continue;
-		if (nla_len(attr) != sizeof(struct bridge_vlan_info))
-			return -EINVAL;
-		vinfo = nla_data(attr);
-		if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
-			return -EINVAL;
-		if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
-			if (vinfo_start)
+		err = 0;
+		switch (nla_type(attr)) {
+		case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
+			if (!(p->flags & BR_VLAN_TUNNEL))
 				return -EINVAL;
-			vinfo_start = vinfo;
-			/* don't allow range of pvids */
-			if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
+			err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
+			if (err)
+				return err;
+			err = br_process_vlan_tunnel_info(br, p, cmd,
+							  &tinfo_curr,
+							  &tinfo_last);
+			if (err)
+				return err;
+			break;
+		case IFLA_BRIDGE_VLAN_INFO:
+			if (nla_len(attr) != sizeof(struct bridge_vlan_info))
 				return -EINVAL;
-			continue;
+			vinfo_curr = nla_data(attr);
+			err = br_process_vlan_info(br, p, cmd, vinfo_curr,
+						   &vinfo_last);
+			if (err)
+				return err;
+			break;
 		}
 
-		if (vinfo_start) {
-			struct bridge_vlan_info tmp_vinfo;
-			int v;
-
-			if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END))
-				return -EINVAL;
-
-			if (vinfo->vid <= vinfo_start->vid)
-				return -EINVAL;
-
-			memcpy(&tmp_vinfo, vinfo_start,
-			       sizeof(struct bridge_vlan_info));
-
-			for (v = vinfo_start->vid; v <= vinfo->vid; v++) {
-				tmp_vinfo.vid = v;
-				err = br_vlan_info(br, p, cmd, &tmp_vinfo);
-				if (err)
-					break;
-			}
-			vinfo_start = NULL;
-		} else {
-			err = br_vlan_info(br, p, cmd, vinfo);
-		}
 		if (err)
-			break;
+			return err;
 	}
 
 	return err;
@@ -630,8 +674,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
 /* Process bridge protocol info on port */
 static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 {
-	int err;
 	unsigned long old_flags = p->flags;
+	bool br_vlan_tunnel_old = false;
+	int err;
 
 	br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
 	br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
@@ -644,6 +689,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
 
+	br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
+	br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
+	if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
+		nbp_vlan_tunnel_info_flush(p);
+
 	if (tb[IFLA_BRPORT_COST]) {
 		err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
 		if (err)
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..99c68012c9d4
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,296 @@
+/*
+ *	Bridge per vlan tunnel port dst_metadata netlink control interface
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/if_bridge.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static size_t __get_vlan_tinfo_size(void)
+{
+	return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
+		  nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
+		  nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
+		  nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
+}
+
+static bool vlan_tunnel_id_isrange(struct net_bridge_vlan *v,
+				   struct net_bridge_vlan *v_end)
+{
+	__be32 tunid_curr = tunnel_id_to_key32(v->tinfo.tunnel_id);
+	__be32 tunid_end = tunnel_id_to_key32(v_end->tinfo.tunnel_id);
+
+	return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_end)) == 1;
+}
+
+static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *v, *v_start = NULL, *v_end = NULL;
+	int num_tinfos = 0;
+
+	/* Count number of vlan infos */
+	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+		/* only a context, bridge vlan not activated */
+		if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
+			continue;
+
+		if (!v_start) {
+			goto initvars;
+		} else if ((v->vid - v_end->vid) == 1 &&
+			   vlan_tunnel_id_isrange(v_end, v) == 1) {
+			v_end = v;
+			continue;
+		} else {
+			if ((v_end->vid - v->vid) > 0 &&
+			    vlan_tunnel_id_isrange(v_end, v) > 0)
+				num_tinfos += 2;
+			else
+				num_tinfos += 1;
+		}
+initvars:
+		v_start = v;
+		v_end = v;
+	}
+
+	if (v_start) {
+		if ((v_end->vid - v->vid) > 0 &&
+		    vlan_tunnel_id_isrange(v_end, v) > 0)
+			num_tinfos += 2;
+		else
+			num_tinfos += 1;
+	}
+
+	return num_tinfos;
+}
+
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
+{
+	int num_tinfos;
+
+	if (!vg)
+		return 0;
+
+	rcu_read_lock();
+	num_tinfos = __get_num_vlan_tunnel_infos(vg);
+	rcu_read_unlock();
+
+	return num_tinfos * __get_vlan_tinfo_size();
+}
+
+static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
+			      __be64 tunnel_id, u16 flags)
+{
+	__be32 tid = tunnel_id_to_key32(tunnel_id);
+	struct nlattr *tmap;
+
+	tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
+	if (!tmap)
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
+			be32_to_cpu(tid)))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
+			vid))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+			flags))
+		goto nla_put_failure;
+	nla_nest_end(skb, tmap);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, tmap);
+
+	return -EMSGSIZE;
+}
+
+static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
+				    struct net_bridge_vlan *vtbegin,
+				    struct net_bridge_vlan *vtend)
+{
+	int err;
+
+	if (vtbegin && vtend && (vtend->vid - vtbegin->vid) > 0) {
+		/* add range to skb */
+		err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+					 vtbegin->tinfo.tunnel_id,
+					 BRIDGE_VLAN_INFO_RANGE_BEGIN);
+		if (err)
+			return err;
+
+		err = br_fill_vlan_tinfo(skb, vtend->vid,
+					 vtend->tinfo.tunnel_id,
+					 BRIDGE_VLAN_INFO_RANGE_END);
+		if (err)
+			return err;
+	} else {
+		err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+					 vtbegin->tinfo.tunnel_id,
+					 0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+			     struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *vtbegin = NULL;
+	struct net_bridge_vlan *vtend = NULL;
+	struct net_bridge_vlan *v;
+	int err;
+
+	/* Count number of vlan infos */
+	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+		/* only a context, bridge vlan not activated */
+		if (!br_vlan_should_use(v))
+			continue;
+
+		if (!v->tinfo.tunnel_dst)
+			continue;
+
+		if (!vtbegin) {
+			goto initvars;
+		} else if ((v->vid - vtend->vid) == 1 &&
+			    vlan_tunnel_id_isrange(v, vtend)) {
+			vtend = v;
+			continue;
+		} else {
+			err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+			if (err)
+				return err;
+		}
+initvars:
+		vtbegin = v;
+		vtend = v;
+	}
+
+	if (vtbegin) {
+		err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+	[IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
+	[IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
+	[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
+};
+
+static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
+			       u16 vid, u32 tun_id)
+{
+	int err = 0;
+
+	if (!p)
+		return -EINVAL;
+
+	switch (cmd) {
+	case RTM_SETLINK:
+		err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
+		break;
+	case RTM_DELLINK:
+		nbp_vlan_tunnel_info_delete(p, vid);
+		break;
+	}
+
+	return err;
+}
+
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+			      struct vtunnel_info *tinfo)
+{
+	struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
+	u32 tun_id;
+	u16 vid, flags = 0;
+	int err;
+
+	memset(tinfo, 0, sizeof(*tinfo));
+
+	if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
+	    !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+			       attr, vlan_tunnel_policy);
+	if (err < 0)
+		return err;
+
+	tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
+	vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
+	if (vid >= VLAN_VID_MASK)
+		return -ERANGE;
+
+	if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
+		flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
+
+	tinfo->tunid = tun_id;
+	tinfo->vid = vid;
+	tinfo->flags = flags;
+
+	return 0;
+}
+
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+				struct net_bridge_port *p, int cmd,
+				struct vtunnel_info *tinfo_curr,
+				struct vtunnel_info *tinfo_last)
+{
+	int err;
+
+	if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+		if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
+			return -EINVAL;
+		memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
+	} else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+		int t, v;
+
+		if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
+			return -EINVAL;
+		if ((tinfo_curr->vid - tinfo_last->vid) !=
+		    (tinfo_curr->tunid - tinfo_last->tunid))
+			return -EINVAL;
+		t = tinfo_last->tunid;
+		for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
+			err = br_vlan_tunnel_info(p, cmd, v, t);
+			if (err)
+				return err;
+			t++;
+		}
+		memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+		memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+	} else {
+		if (tinfo_last->flags)
+			return -EINVAL;
+		err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
+					  tinfo_curr->tunid);
+		if (err)
+			return err;
+		memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+		memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+	}
+
+	return 0;
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0b82a227fc34..61de90f28afa 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -91,6 +91,11 @@ struct br_vlan_stats {
 	struct u64_stats_sync syncp;
 };
 
+struct br_tunnel_info {
+	__be64			tunnel_id;
+	struct metadata_dst	*tunnel_dst;
+};
+
 /**
  * struct net_bridge_vlan - per-vlan entry
  *
@@ -113,6 +118,7 @@ struct br_vlan_stats {
  */
 struct net_bridge_vlan {
 	struct rhash_head		vnode;
+	struct rhash_head		tnode;
 	u16				vid;
 	u16				flags;
 	struct br_vlan_stats __percpu	*stats;
@@ -124,6 +130,9 @@ struct net_bridge_vlan {
 		atomic_t		refcnt;
 		struct net_bridge_vlan	*brvlan;
 	};
+
+	struct br_tunnel_info		tinfo;
+
 	struct list_head		vlist;
 
 	struct rcu_head			rcu;
@@ -145,6 +154,7 @@ struct net_bridge_vlan {
  */
 struct net_bridge_vlan_group {
 	struct rhashtable		vlan_hash;
+	struct rhashtable		tunnel_hash;
 	struct list_head		vlan_list;
 	u16				num_vlans;
 	u16				pvid;
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..1c8d0d5302cb
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,72 @@
+/*
+ *	Bridge per vlan tunnels
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_TUNNEL_H
+#define _BR_PRIVATE_TUNNEL_H
+
+struct vtunnel_info {
+	u32	tunid;
+	u16	vid;
+	u16	flags;
+};
+
+/* br_netlink_tunnel.c */
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+			      struct vtunnel_info *tinfo);
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+				struct net_bridge_port *p,
+				int cmd,
+				struct vtunnel_info *tinfo_curr,
+				struct vtunnel_info *tinfo_last);
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+			     struct net_bridge_vlan_group *vg);
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+/* br_vlan_tunnel.c */
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+			  struct net_bridge_vlan *vlan);
+#else
+static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+	return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+					      u16 vid)
+{
+	return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+					   u16 vid, u32 tun_id)
+{
+	return 0;
+}
+
+static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+}
+
+static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+					struct net_bridge_vlan *vlan)
+{
+}
+
+#endif
+
+#endif
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index b6de4f457161..64002e3941ca 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -5,6 +5,7 @@
 #include <net/switchdev.h>
 
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
 			      const void *ptr)
@@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
 	}
 
 	if (masterv != v) {
+		vlan_tunnel_info_del(vg, v);
 		rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
 				       br_vlan_rht_params);
 		__vlan_del_list(v);
@@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
 {
 	WARN_ON(!list_empty(&vg->vlan_list));
 	rhashtable_destroy(&vg->vlan_hash);
+	vlan_tunnel_deinit(vg);
 	kfree(vg);
 }
 
@@ -613,6 +616,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid)
 	br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
 	br_fdb_delete_by_port(br, NULL, vid, 0);
 
+	vlan_tunnel_info_del(vg, v);
+
 	return __vlan_del(v);
 }
 
@@ -918,6 +923,9 @@ int br_vlan_init(struct net_bridge *br)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	br->vlan_proto = htons(ETH_P_8021Q);
 	br->default_pvid = 1;
@@ -932,6 +940,8 @@ out:
 	return ret;
 
 err_vlan_add:
+	vlan_tunnel_deinit(vg);
+err_tunnel_init:
 	rhashtable_destroy(&vg->vlan_hash);
 err_rhtbl:
 	kfree(vg);
@@ -961,6 +971,9 @@ int nbp_vlan_init(struct net_bridge_port *p)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	rcu_assign_pointer(p->vlgrp, vg);
 	if (p->br->default_pvid) {
@@ -976,8 +989,10 @@ out:
 err_vlan_add:
 	RCU_INIT_POINTER(p->vlgrp, NULL);
 	synchronize_rcu();
-	rhashtable_destroy(&vg->vlan_hash);
+	vlan_tunnel_deinit(vg);
 err_vlan_enabled:
+err_tunnel_init:
+	rhashtable_destroy(&vg->vlan_hash);
 err_rhtbl:
 	kfree(vg);
 
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..b3fd29d20a3c
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,149 @@
+/*
+ *	Bridge per vlan tunnel port dst_metadata handling code
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/switchdev.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
+				    const void *ptr)
+{
+	const struct net_bridge_vlan *vle = ptr;
+	__be64 tunid = *(__be64 *)arg->key;
+
+	return vle->tinfo.tunnel_id != tunid;
+}
+
+static const struct rhashtable_params br_vlan_tunnel_rht_params = {
+	.head_offset = offsetof(struct net_bridge_vlan, tnode),
+	.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
+	.key_len = sizeof(__be64),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = br_vlan_tunid_cmp,
+	.automatic_shrinking = true,
+};
+
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+			  struct net_bridge_vlan *vlan)
+{
+	if (!vlan->tinfo.tunnel_dst)
+		return;
+	rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
+			       br_vlan_tunnel_rht_params);
+	vlan->tinfo.tunnel_id = 0;
+	dst_release(&vlan->tinfo.tunnel_dst->dst);
+	vlan->tinfo.tunnel_dst = NULL;
+}
+
+static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
+				  struct net_bridge_vlan *vlan, u32 tun_id)
+{
+	struct metadata_dst *metadata = NULL;
+	__be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+	int err;
+
+	if (vlan->tinfo.tunnel_dst)
+		return -EEXIST;
+
+	metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
+				    key, 0);
+	if (!metadata)
+		return -EINVAL;
+
+	metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
+	vlan->tinfo.tunnel_dst = metadata;
+	vlan->tinfo.tunnel_id = key;
+
+	err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
+					    br_vlan_tunnel_rht_params);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	dst_release(&vlan->tinfo.tunnel_dst->dst);
+
+	return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *vlan;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	vlan = br_vlan_find(vg, vid);
+	if (!vlan)
+		return -EINVAL;
+
+	return __vlan_tunnel_info_add(vg, vlan, tun_id);
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	v = br_vlan_find(vg, vid);
+	if (!v)
+		return -ENOENT;
+
+	vlan_tunnel_info_del(vg, v);
+
+	return 0;
+}
+
+static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *vlan, *tmp;
+
+	list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+		vlan_tunnel_info_del(vg, vlan);
+}
+
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+	struct net_bridge_vlan_group *vg;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	__vlan_tunnel_info_flush(vg);
+}
+
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+	return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
+{
+	rhashtable_destroy(&vg->tunnel_hash);
+}
-- 
cgit v1.2.3


From 11538d039ac6efcf4f1a6c536e1b87cd3668a9fd Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Tue, 31 Jan 2017 22:59:55 -0800
Subject: bridge: vlan dst_metadata hooks in ingress and egress paths

- ingress hook:
    - if port is a tunnel port, use tunnel info in
      attached dst_metadata to map it to a local vlan
- egress hook:
    - if port is a tunnel port, use tunnel info attached to
      vlan to set dst_metadata on the skb

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c        |  2 +-
 net/bridge/br_input.c          |  8 ++++++-
 net/bridge/br_private.h        |  2 ++
 net/bridge/br_private_tunnel.h | 11 +++++++++
 net/bridge/br_vlan.c           |  7 ++++++
 net/bridge/br_vlan_tunnel.c    | 54 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 82 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 5a1f8ef49899..6bfac29318f2 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
 	int br_hook;
 
 	vg = nbp_vlan_group_rcu(to);
-	skb = br_handle_vlan(to->br, vg, skb);
+	skb = br_handle_vlan(to->br, to, vg, skb);
 	if (!skb)
 		return;
 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72fbe1da..fba38d8a1a08 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/rculist.h>
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 /* Hook for brouter */
 br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -57,7 +58,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 
 	indev = skb->dev;
 	skb->dev = brdev;
-	skb = br_handle_vlan(br, vg, skb);
+	skb = br_handle_vlan(br, NULL, vg, skb);
 	if (!skb)
 		return NET_RX_DROP;
 	/* update the multicast stats if the packet is IGMP/MLD */
@@ -261,6 +262,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		return RX_HANDLER_CONSUMED;
 
 	p = br_port_get_rcu(skb->dev);
+	if (p->flags & BR_VLAN_TUNNEL) {
+		if (br_handle_ingress_vlan_tunnel(skb, p,
+						  nbp_vlan_group_rcu(p)))
+			goto drop;
+	}
 
 	if (unlikely(is_link_local_ether_addr(dest))) {
 		u16 fwd_mask = p->br->group_fwd_mask_required;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 61de90f28afa..40177df45ba6 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -775,6 +775,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
 		       const struct sk_buff *skb);
 bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *port,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb);
 int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -874,6 +875,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
 }
 
 static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+					     const struct net_bridge_port *port,
 					     struct net_bridge_vlan_group *vg,
 					     struct sk_buff *skb)
 {
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
index 1c8d0d5302cb..4a447a378ab3 100644
--- a/net/bridge/br_private_tunnel.h
+++ b/net/bridge/br_private_tunnel.h
@@ -40,6 +40,11 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
 void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
 void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
 			  struct net_bridge_vlan *vlan);
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+				  struct net_bridge_port *p,
+				  struct net_bridge_vlan_group *vg);
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+				 struct net_bridge_vlan *vlan);
 #else
 static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
 {
@@ -67,6 +72,12 @@ static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
 {
 }
 
+static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+						struct net_bridge_port *p,
+						struct net_bridge_vlan_group *vg)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 64002e3941ca..62e68c0dc687 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -341,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
 }
 
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *p,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb)
 {
@@ -381,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
 
 	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
 		skb->vlan_tci = 0;
+
+	if (p && (p->flags & BR_VLAN_TUNNEL) &&
+	    br_handle_egress_vlan_tunnel(skb, v)) {
+		kfree_skb(skb);
+		return NULL;
+	}
 out:
 	return skb;
 }
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index b3fd29d20a3c..b2b79a070162 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -39,6 +39,13 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = {
 	.automatic_shrinking = true,
 };
 
+static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
+						     u64 tunnel_id)
+{
+	return rhashtable_lookup_fast(tbl, &tunnel_id,
+				      br_vlan_tunnel_rht_params);
+}
+
 void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
 			  struct net_bridge_vlan *vlan)
 {
@@ -147,3 +154,50 @@ void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
 {
 	rhashtable_destroy(&vg->tunnel_hash);
 }
+
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+				  struct net_bridge_port *p,
+				  struct net_bridge_vlan_group *vg)
+{
+	struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
+	struct net_bridge_vlan *vlan;
+
+	if (!vg || !tinfo)
+		return 0;
+
+	/* if already tagged, ignore */
+	if (skb_vlan_tagged(skb))
+		return 0;
+
+	/* lookup vid, given tunnel id */
+	vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
+	if (!vlan)
+		return 0;
+
+	skb_dst_drop(skb);
+
+	__vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+
+	return 0;
+}
+
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+				 struct net_bridge_vlan *vlan)
+{
+	int err;
+
+	if (!vlan || !vlan->tinfo.tunnel_id)
+		return 0;
+
+	if (unlikely(!skb_vlan_tag_present(skb)))
+		return 0;
+
+	skb_dst_drop(skb);
+	err = skb_vlan_pop(skb);
+	if (err)
+		return err;
+
+	skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
+
+	return 0;
+}
-- 
cgit v1.2.3


From 94b5e0f970258828bf163b5ef076da4e4b0802e0 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 08:52:21 -0800
Subject: net: ipv6: Set protocol to kernel for local routes

IPv6 stack does not set the protocol for local routes, so those routes show
up with proto "none":
    $ ip -6 ro ls table local
    local ::1 dev lo proto none metric 0  pref medium
    local 2100:3:: dev lo proto none metric 0  pref medium
    local 2100:3::4 dev lo proto none metric 0  pref medium
    local fe80:: dev lo proto none metric 0  pref medium
    ...

Set rt6i_protocol to RTPROT_KERNEL for consistency with IPv4. Now routes
show up with proto "kernel":
    $ ip -6 ro ls table local
    local ::1 dev lo proto kernel metric 0  pref medium
    local 2100:3:: dev lo proto kernel metric 0  pref medium
    local 2100:3::4 dev lo proto kernel metric 0  pref medium
    local fe80:: dev lo proto kernel metric 0  pref medium
    ...

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2563331b0532..91eb3f7782dd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2634,6 +2634,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	rt->dst.output = ip6_output;
 	rt->rt6i_idev = idev;
 
+	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast)
 		rt->rt6i_flags |= RTF_ANYCAST;
-- 
cgit v1.2.3


From 38ab52e8e1e2dd65f2d349f82553335813b638d2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Feb 2017 20:40:08 -0800
Subject: tcp: clear pfmemalloc on outgoing skb

Josef Bacik diagnosed following problem :

   I was seeing random disconnects while testing NBD over loopback.
   This turned out to be because NBD sets pfmemalloc on it's socket,
   however the receiving side is a user space application so does not
   have pfmemalloc set on its socket. This means that
   sk_filter_trim_cap will simply drop this packet, under the
   assumption that the other side will simply retransmit. Well we do
   retransmit, and then the packet is just dropped again for the same
   reason.

It seems the better way to address this problem is to clear pfmemalloc
in the TCP transmit path. pfmemalloc strict control really makes sense
on the receive path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 956bea9a5394..ccf1ef4dcba4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -964,6 +964,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	 */
 	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
 
+	/* If we had to use memory reserve to allocate this skb,
+	 * this might cause drops if packet is looped back :
+	 * Other socket might not have SOCK_MEMALLOC.
+	 * Packets not looped back do not care about pfmemalloc.
+	 */
+	skb->pfmemalloc = 0;
+
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
 
-- 
cgit v1.2.3


From 69ca05ce9dec2cc95070df7f1f10ea6c9c12d237 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Fri, 3 Feb 2017 10:29:08 +0100
Subject: sched: cls_flower: expose priority to offloading netdevice

The driver that offloads flower rules needs to know with which priority
user inserted the rules. So add this information into offload struct.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h  | 1 +
 net/sched/cls_flower.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index b43077e47d35..dabb00af46a0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -481,6 +481,7 @@ enum tc_fl_command {
 
 struct tc_cls_flower_offload {
 	enum tc_fl_command command;
+	u32 prio;
 	unsigned long cookie;
 	struct flow_dissector *dissector;
 	struct fl_flow_key *mask;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 23c4d224dcb1..0826c8ec3a76 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -229,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 		return;
 
 	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 
 	tc->type = TC_SETUP_CLSFLOWER;
@@ -260,6 +261,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	}
 
 	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 	offload.dissector = dissector;
 	offload.mask = mask;
@@ -287,6 +289,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 		return;
 
 	offload.command = TC_CLSFLOWER_STATS;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 	offload.exts = &f->exts;
 
-- 
cgit v1.2.3


From 79e7fff47b7bb4124ef970a13eac4fdeddd1fc25 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Feb 2017 18:43:28 -0800
Subject: net: remove support for per driver ndo_busy_poll()

We added generic support for busy polling in NAPI layer in linux-4.5

No network driver uses ndo_busy_poll() anymore, we can get rid
of the pointer in struct net_device_ops, and its use in sk_busy_loop()

Saves NETIF_F_BUSY_POLL features bit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  2 --
 include/linux/netdevice.h       |  3 ---
 net/core/dev.c                  | 15 ---------------
 net/core/ethtool.c              |  1 -
 4 files changed, 21 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9c6c8ef2e9e7..9a0419594e84 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -71,7 +71,6 @@ enum {
 	NETIF_F_HW_VLAN_STAG_RX_BIT,	/* Receive VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
-	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
 
 	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
 
@@ -134,7 +133,6 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
-#define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
 #define NETIF_F_HW_TC		__NETIF_F(HW_TC)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f3878fbe7786..6f18b509fb2f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1184,9 +1184,6 @@ struct net_device_ops {
 	int			(*ndo_netpoll_setup)(struct net_device *dev,
 						     struct netpoll_info *info);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
-#endif
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	int			(*ndo_busy_poll)(struct napi_struct *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
diff --git a/net/core/dev.c b/net/core/dev.c
index 727b6fda0e8c..4cde8bfb9bab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4978,7 +4978,6 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
-	int (*busy_poll)(struct napi_struct *dev);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
 	int rc;
@@ -4993,17 +4992,10 @@ restart:
 	if (!napi)
 		goto out;
 
-	/* Note: ndo_busy_poll method is optional in linux-4.5 */
-	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
-
 	preempt_disable();
 	for (;;) {
 		rc = 0;
 		local_bh_disable();
-		if (busy_poll) {
-			rc = busy_poll(napi);
-			goto count;
-		}
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
 
@@ -6956,13 +6948,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		features &= ~dev->gso_partial_features;
 	}
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (dev->netdev_ops->ndo_busy_poll)
-		features |= NETIF_F_BUSY_POLL;
-	else
-#endif
-		features &= ~NETIF_F_BUSY_POLL;
-
 	return features;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6b3eee0834a0..d5f412b3093d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXFCS_BIT] =            "rx-fcs",
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
-	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
 	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
 };
 
-- 
cgit v1.2.3


From 6e7bc478c9a006c701c14476ec9d389a484b4864 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 3 Feb 2017 14:29:42 -0800
Subject: net: skb_needs_check() accepts CHECKSUM_NONE for tx

My recent change missed fact that UFO would perform a complete
UDP checksum before segmenting in frags.

In this case skb->ip_summed is set to CHECKSUM_NONE.

We need to add this valid case to skb_needs_check()

Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4cde8bfb9bab..42ba0379575a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2637,9 +2637,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
 	if (tx_path)
-		return skb->ip_summed != CHECKSUM_PARTIAL;
-	else
-		return skb->ip_summed == CHECKSUM_NONE;
+		return skb->ip_summed != CHECKSUM_PARTIAL &&
+		       skb->ip_summed != CHECKSUM_NONE;
+
+	return skb->ip_summed == CHECKSUM_NONE;
 }
 
 /**
-- 
cgit v1.2.3


From 0ae8133586ad1c9be894411aaf8b17bb58c8efe5 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 12:37:08 -0800
Subject: net: ipv6: Allow shorthand delete of all nexthops in multipath route

IPv4 allows multipath routes to be deleted using just the prefix and
length. For example:
    $ ip ro ls vrf red
    unreachable default metric 8192
    1.1.1.0/24
        nexthop via 10.100.1.254  dev eth1 weight 1
        nexthop via 10.11.200.2  dev eth11.200 weight 1
    10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3
    10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3

    $ ip ro del 1.1.1.0/24 vrf red

    $ ip ro ls vrf red
    unreachable default metric 8192
    10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3
    10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3

The same notation does not work with IPv6 because of how multipath routes
are implemented for IPv6. For IPv6 only the first nexthop of a multipath
route is deleted if the request contains only a prefix and length. This
leads to unnecessary complexity in userspace dealing with IPv6 multipath
routes.

This patch allows all nexthops to be deleted without specifying each one
in the delete request. Internally, this is done by walking the sibling
list of the route matching the specifications given (prefix, length,
metric, protocol, etc).

    $  ip -6 ro ls vrf red
    2001:db8:1::/120 dev eth1 proto kernel metric 256  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    2001:db8:200::/120 via 2001:db8:1::2 dev eth1 metric 1024  pref medium
    2001:db8:200::/120 via 2001:db8:2::2 dev eth2 metric 1024  pref medium
    ...

    $ ip -6 ro del vrf red 2001:db8:200::/120

    $ ip -6 ro ls vrf red
    2001:db8:1::/120 dev eth1 proto kernel metric 256  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    ...

Because IPv6 allows individual nexthops to be deleted without deleting
the entire route, the ip6_route_multipath_del and non-multipath code
path (ip6_route_del) have to be discriminated so that all nexthops are
only deleted for the latter case. This is done by making the existing
fc_type in fib6_config a u16 and then adding a new u16 field with
fc_delete_all_nh as the first bit.

Suggested-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  4 +++-
 net/ipv6/route.c      | 38 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa40ef4..c979c878df1c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -37,7 +37,9 @@ struct fib6_config {
 	int		fc_ifindex;
 	u32		fc_flags;
 	u32		fc_protocol;
-	u32		fc_type;	/* only 8 bits are used */
+	u16		fc_type;        /* only 8 bits are used */
+	u16		fc_delete_all_nh : 1,
+			__unused : 15;
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 91eb3f7782dd..635b7fdef2eb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2143,6 +2143,34 @@ int ip6_del_rt(struct rt6_info *rt)
 	return __ip6_del_rt(rt, &info);
 }
 
+static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
+{
+	struct nl_info *info = &cfg->fc_nlinfo;
+	struct fib6_table *table;
+	int err;
+
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);
+
+	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+		struct rt6_info *sibling, *next_sibling;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &rt->rt6i_siblings,
+					 rt6i_siblings) {
+			err = fib6_del(sibling, info);
+			if (err)
+				goto out;
+		}
+	}
+
+	err = fib6_del(rt, info);
+out:
+	write_unlock_bh(&table->tb6_lock);
+	ip6_rt_put(rt);
+	return err;
+}
+
 static int ip6_route_del(struct fib6_config *cfg)
 {
 	struct fib6_table *table;
@@ -2179,7 +2207,11 @@ static int ip6_route_del(struct fib6_config *cfg)
 			dst_hold(&rt->dst);
 			read_unlock_bh(&table->tb6_lock);
 
-			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+			/* if gateway was specified only delete the one hop */
+			if (cfg->fc_flags & RTF_GATEWAY)
+				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+
+			return __ip6_del_rt_siblings(rt, cfg);
 		}
 	}
 	read_unlock_bh(&table->tb6_lock);
@@ -3142,8 +3174,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	if (cfg.fc_mp)
 		return ip6_route_multipath_del(&cfg);
-	else
+	else {
+		cfg.fc_delete_all_nh = 1;
 		return ip6_route_del(&cfg);
+	}
 }
 
 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
-- 
cgit v1.2.3


From beb1afac518dec5a15dc92ba8f0ca016dcf457b4 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 12:37:09 -0800
Subject: net: ipv6: Add support to dump multipath routes via RTA_MULTIPATH
 attribute

IPv6 returns multipath routes as a series of individual routes making
their display and handling by userspace different and more complicated
than IPv4, putting the burden on the user to see that a route is part of
a multipath route and internally creating a multipath route if desired
(e.g., libnl does this as of commit 29b71371e764). This patch addresses
this difference, allowing multipath routes to be returned using the
RTA_MULTIPATH attribute.

The end result is that IPv6 multipath routes can be treated and displayed
in a format similar to IPv4:

    $ ip -6 ro ls vrf red
    2001:db8:1::/120 dev eth1 proto kernel metric 256  pref medium
    2001:db8:2::/120 dev eth2 proto kernel metric 256  pref medium
    2001:db8:200::/120 metric 1024
	    nexthop via 2001:db8:1::2  dev eth1 weight 1
	    nexthop via 2001:db8:2::2  dev eth2 weight 1

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c |  10 +++++
 net/ipv6/route.c   | 112 +++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 105 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index febde6c112bf..1bf5e22fb95d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w)
 			w->leaf = rt;
 			return 1;
 		}
+
+		/* Multipath routes are dumped in one route with the
+		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
+		 * last sibling of this route (no need to dump the
+		 * sibling routes again)
+		 */
+		if (rt->rt6i_nsiblings)
+			rt = list_last_entry(&rt->rt6i_siblings,
+					     struct rt6_info,
+					     rt6i_siblings);
 	}
 	w->leaf = NULL;
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 635b7fdef2eb..c740d9e249a6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3195,8 +3195,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return ip6_route_add(&cfg);
 }
 
-static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct rt6_info *rt)
 {
+	int nexthop_len = 0;
+
+	if (rt->rt6i_nsiblings) {
+		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
+			    + NLA_ALIGN(sizeof(struct rtnexthop))
+			    + nla_total_size(16) /* RTA_GATEWAY */
+			    + nla_total_size(4)  /* RTA_OIF */
+			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
+
+		nexthop_len *= rt->rt6i_nsiblings;
+	}
+
 	return NLMSG_ALIGN(sizeof(struct rtmsg))
 	       + nla_total_size(16) /* RTA_SRC */
 	       + nla_total_size(16) /* RTA_DST */
@@ -3210,7 +3222,62 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
+	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
+	       + nexthop_len;
+}
+
+static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+			    unsigned int *flags)
+{
+	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
+		*flags |= RTNH_F_LINKDOWN;
+		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+			*flags |= RTNH_F_DEAD;
+	}
+
+	if (rt->rt6i_flags & RTF_GATEWAY) {
+		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+			goto nla_put_failure;
+	}
+
+	if (rt->dst.dev &&
+	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+		goto nla_put_failure;
+
+	if (rt->dst.lwtstate &&
+	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+{
+	struct rtnexthop *rtnh;
+	unsigned int flags = 0;
+
+	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+	if (!rtnh)
+		goto nla_put_failure;
+
+	rtnh->rtnh_hops = 0;
+	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
+
+	if (rt6_nexthop_info(skb, rt, &flags) < 0)
+		goto nla_put_failure;
+
+	rtnh->rtnh_flags = flags;
+
+	/* length of rtnetlink header + attributes */
+	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
 }
 
 static int rt6_fill_node(struct net *net,
@@ -3264,11 +3331,6 @@ static int rt6_fill_node(struct net *net,
 	else
 		rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
-	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
-		rtm->rtm_flags |= RTNH_F_LINKDOWN;
-		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
-			rtm->rtm_flags |= RTNH_F_DEAD;
-	}
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
 	rtm->rtm_protocol = rt->rt6i_protocol;
 	if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -3332,17 +3394,35 @@ static int rt6_fill_node(struct net *net,
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
-	if (rt->rt6i_flags & RTF_GATEWAY) {
-		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
-			goto nla_put_failure;
-	}
-
-	if (rt->dst.dev &&
-	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
-		goto nla_put_failure;
 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
 		goto nla_put_failure;
 
+	/* For multipath routes, walk the siblings list and add
+	 * each as a nexthop within RTA_MULTIPATH.
+	 */
+	if (rt->rt6i_nsiblings) {
+		struct rt6_info *sibling, *next_sibling;
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		if (rt6_add_nexthop(skb, rt) < 0)
+			goto nla_put_failure;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &rt->rt6i_siblings, rt6i_siblings) {
+			if (rt6_add_nexthop(skb, sibling) < 0)
+				goto nla_put_failure;
+		}
+
+		nla_nest_end(skb, mp);
+	} else {
+		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0)
+			goto nla_put_failure;
+	}
+
 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
 
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
@@ -3351,8 +3431,6 @@ static int rt6_fill_node(struct net *net,
 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
 		goto nla_put_failure;
 
-	if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
-		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
 	return 0;
-- 
cgit v1.2.3


From 3b1137fe74829e021f483756a648cbb87c8a1b4a Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 12:37:10 -0800
Subject: net: ipv6: Change notifications for multipath add to RTA_MULTIPATH

Change ip6_route_multipath_add to send one notifciation with the full
route encoded with RTA_MULTIPATH instead of a series of individual routes.
This is done by adding a skip_notify flag to the nl_info struct. The
flag is used to skip sending of the notification in the fib code that
actually inserts the route. Once the full route has been added, a
notification is generated with all nexthops.

ip6_route_multipath_add handles 3 use cases: new routes, route replace,
and route append. The multipath notification generated needs to be
consistent with the order of the nexthops and it should be consistent
with the order in a FIB dump which means the route with the first nexthop
needs to be used as the route reference. For the first 2 cases (new and
replace), a reference to the route used to send the notification is
obtained by saving the first route added. For the append case, the last
route added is used to loop back to its first sibling route which is
the first nexthop in the multipath route.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h |  1 +
 net/ipv6/ip6_fib.c    |  6 ++++--
 net/ipv6/route.c      | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 54 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index d3938f11ae52..b239fcd33d80 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -229,6 +229,7 @@ struct nl_info {
 	struct nlmsghdr		*nlh;
 	struct net		*nl_net;
 	u32			portid;
+	bool			skip_notify;
 };
 
 int netlink_rcv_skb(struct sk_buff *skb,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1bf5e22fb95d..99c68ce6ef78 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -881,7 +881,8 @@ add:
 		*ins = rt;
 		rt->rt6i_node = fn;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+		if (!info->skip_notify)
+			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
 
 		if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -907,7 +908,8 @@ add:
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
+		if (!info->skip_notify)
+			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c740d9e249a6..cb3366d5e165 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3023,13 +3023,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
 	return 0;
 }
 
+static void ip6_route_mpath_notify(struct rt6_info *rt,
+				   struct rt6_info *rt_last,
+				   struct nl_info *info,
+				   __u16 nlflags)
+{
+	/* if this is an APPEND route, then rt points to the first route
+	 * inserted and rt_last points to last route inserted. Userspace
+	 * wants a consistent dump of the route which starts at the first
+	 * nexthop. Since sibling routes are always added at the end of
+	 * the list, find the first sibling of the last route appended
+	 */
+	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
+		rt = list_first_entry(&rt_last->rt6i_siblings,
+				      struct rt6_info,
+				      rt6i_siblings);
+	}
+
+	if (rt)
+		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+}
+
 static int ip6_route_multipath_add(struct fib6_config *cfg)
 {
+	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
+	struct nl_info *info = &cfg->fc_nlinfo;
 	struct fib6_config r_cfg;
 	struct rtnexthop *rtnh;
 	struct rt6_info *rt;
 	struct rt6_nh *err_nh;
 	struct rt6_nh *nh, *nh_safe;
+	__u16 nlflags;
 	int remaining;
 	int attrlen;
 	int err = 1;
@@ -3038,6 +3062,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
 	LIST_HEAD(rt6_nh_list);
 
+	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
+	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
+		nlflags |= NLM_F_APPEND;
+
 	remaining = cfg->fc_mp_len;
 	rtnh = (struct rtnexthop *)cfg->fc_mp;
 
@@ -3080,9 +3108,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		rtnh = rtnh_next(rtnh, &remaining);
 	}
 
+	/* for add and replace send one notification with all nexthops.
+	 * Skip the notification in fib6_add_rt2node and send one with
+	 * the full route when done
+	 */
+	info->skip_notify = 1;
+
 	err_nh = NULL;
 	list_for_each_entry(nh, &rt6_nh_list, next) {
-		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
+		rt_last = nh->rt6_info;
+		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
+		/* save reference to first route for notification */
+		if (!rt_notif && !err)
+			rt_notif = nh->rt6_info;
+
 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
 		nh->rt6_info = NULL;
 		if (err) {
@@ -3104,9 +3143,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		nhn++;
 	}
 
+	/* success ... tell user about new route */
+	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
 	goto cleanup;
 
 add_errout:
+	/* send notification for routes that were added so that
+	 * the delete notifications sent by ip6_route_del are
+	 * coherent
+	 */
+	if (rt_notif)
+		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
+
 	/* Delete routes that were already added */
 	list_for_each_entry(nh, &rt6_nh_list, next) {
 		if (err_nh == nh)
-- 
cgit v1.2.3


From 16a16cd35ee29d9bea54dd60e55d9c1cc685a37d Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 12:37:11 -0800
Subject: net: ipv6: Change notifications for multipath delete to RTA_MULTIPATH

If an entire multipath route is deleted using prefix and len (without any
nexthops), send a single RTM_DELROUTE notification with the full route
using RTA_MULTIPATH. This is done by generating the skb before the route
delete when all of the sibling routes are still present but sending it
after the route has been removed from the FIB. The skip_notify flag
is used to tell the lower fib code not to send notifications for the
individual nexthop routes.

If a route is deleted using RTA_MULTIPATH for any nexthops or a single
nexthop entry is deleted, then the nexthops are deleted one at a time with
notifications sent as each hop is deleted. This is necessary given that
IPv6 allows individual hops within a route to be deleted.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c |  3 ++-
 net/ipv6/route.c   | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 99c68ce6ef78..e4266746e4a2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1454,7 +1454,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	fib6_purge_rt(rt, fn, net);
 
-	inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+	if (!info->skip_notify)
+		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
 	rt6_release(rt);
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cb3366d5e165..194261acb87d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -98,6 +98,12 @@ static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
+static size_t rt6_nlmsg_size(struct rt6_info *rt);
+static int rt6_fill_node(struct net *net,
+			 struct sk_buff *skb, struct rt6_info *rt,
+			 struct in6_addr *dst, struct in6_addr *src,
+			 int iif, int type, u32 portid, u32 seq,
+			 unsigned int flags);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -2146,6 +2152,7 @@ int ip6_del_rt(struct rt6_info *rt)
 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 {
 	struct nl_info *info = &cfg->fc_nlinfo;
+	struct sk_buff *skb = NULL;
 	struct fib6_table *table;
 	int err;
 
@@ -2155,6 +2162,20 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
 		struct rt6_info *sibling, *next_sibling;
 
+		/* prefer to send a single notification with all hops */
+		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+		if (skb) {
+			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+			if (rt6_fill_node(info->nl_net, skb, rt,
+					  NULL, NULL, 0, RTM_DELROUTE,
+					  info->portid, seq, 0) < 0) {
+				kfree_skb(skb);
+				skb = NULL;
+			} else
+				info->skip_notify = 1;
+		}
+
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &rt->rt6i_siblings,
 					 rt6i_siblings) {
@@ -2168,6 +2189,11 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 out:
 	write_unlock_bh(&table->tb6_lock);
 	ip6_rt_put(rt);
+
+	if (skb) {
+		rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV6_ROUTE,
+			    info->nlh, gfp_any());
+	}
 	return err;
 }
 
-- 
cgit v1.2.3


From 7d4d5065ecb0cea9c5815d5e0df5fb586c5ee9b5 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Thu, 2 Feb 2017 12:37:12 -0800
Subject: net: ipv6: Use compressed IPv6 addresses showing route replace error

ip6_print_replace_route_err logs an error if a route replace fails with
IPv6 addresses in the full format. e.g,:

IPv6: IPV6: multipath route replace failed (check consistency of installed routes): 2001:0db8:0200:0000:0000:0000:0000:0000 nexthop 2001:0db8:0001:0000:0000:0000:0000:0016 ifi 0

Change the message to dump the addresses in the compressed format.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 194261acb87d..8ffa24cc8899 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3010,7 +3010,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
 	struct rt6_nh *nh;
 
 	list_for_each_entry(nh, rt6_nh_list, next) {
-		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
+		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
 		        nh->r_cfg.fc_ifindex);
 	}
-- 
cgit v1.2.3


From 02c1602ee7b3e3d062c3eacd374d6a6e3a2ebb73 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 4 Feb 2017 15:25:02 -0800
Subject: net: remove __napi_complete()

All __napi_complete() callers have been converted to
use the more standard napi_complete_done(),
we can now remove this NAPI method for good.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 net/core/dev.c            | 24 +++---------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6f18b509fb2f..014fbe256d55 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -463,7 +463,6 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-bool __napi_complete(struct napi_struct *n);
 bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
diff --git a/net/core/dev.c b/net/core/dev.c
index 42ba0379575a..404d2e6d5d32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4883,23 +4883,6 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-bool __napi_complete(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-
-	/* Some drivers call us directly, instead of calling
-	 * napi_complete_done().
-	 */
-	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
-		return false;
-
-	list_del_init(&n->poll_list);
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	return true;
-}
-EXPORT_SYMBOL(__napi_complete);
-
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
@@ -4926,14 +4909,13 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		else
 			napi_gro_flush(n, false);
 	}
-	if (likely(list_empty(&n->poll_list))) {
-		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
-	} else {
+	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
 		local_irq_save(flags);
-		__napi_complete(n);
+		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
-- 
cgit v1.2.3


From a8eca326151ee1beac82a4fd86d9edad3a37aaed Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Mon, 6 Feb 2017 16:20:14 +0100
Subject: net: remove ndo_neigh_{construct, destroy} from stacked devices

In commit 18bfb924f000 ("net: introduce default neigh_construct/destroy
ndo calls for L2 upper devices") we added these ndos to stacked devices
such as team and bond, so that calls will be propagated to mlxsw.

However, previous commit removed the reliance on these ndos and no new
users of these ndos have appeared since above mentioned commit. We can
therefore safely remove this dead code.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  2 --
 drivers/net/team/team.c         |  2 --
 include/linux/netdevice.h       |  4 ----
 net/8021q/vlan_dev.c            |  2 --
 net/bridge/br_device.c          |  2 --
 net/core/dev.c                  | 44 -----------------------------------------
 6 files changed, 56 deletions(-)

(limited to 'net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e6af04716cf7..6321f12630c8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4145,8 +4145,6 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_add_slave		= bond_enslave,
 	.ndo_del_slave		= bond_release,
 	.ndo_fix_features	= bond_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a3711769544b..4a24b5d15f5a 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2001,8 +2001,6 @@ static const struct net_device_ops team_netdev_ops = {
 	.ndo_add_slave		= team_add_slave,
 	.ndo_del_slave		= team_del_slave,
 	.ndo_fix_features	= team_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_change_carrier     = team_change_carrier,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 014fbe256d55..58afbd1cc659 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3888,10 +3888,6 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
 void netdev_lower_state_changed(struct net_device *lower_dev,
 				void *lower_state_info);
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
-					   struct neighbour *n);
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
-					  struct neighbour *n);
 
 /* RSS keys are 40 or 52 bytes long */
 #define NETDEV_RSS_KEY_LEN 52
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 116455ac3db5..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -791,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_netpoll_cleanup	= vlan_dev_netpoll_cleanup,
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		= switchdev_port_fdb_add,
 	.ndo_fdb_del		= switchdev_port_fdb_del,
 	.ndo_fdb_dump		= switchdev_port_fdb_dump,
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 6c46d1b4cdbb..5ba0b558f8ae 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -347,8 +347,6 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_add_slave		 = br_add_slave,
 	.ndo_del_slave		 = br_del_slave,
 	.ndo_fix_features        = br_fix_features,
-	.ndo_neigh_construct	 = netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	 = netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		 = br_fdb_add,
 	.ndo_fdb_del		 = br_fdb_delete,
 	.ndo_fdb_dump		 = br_fdb_dump,
diff --git a/net/core/dev.c b/net/core/dev.c
index 404d2e6d5d32..3e1a60102e64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6111,50 +6111,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
 }
 EXPORT_SYMBOL(netdev_lower_state_changed);
 
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
-					   struct neighbour *n)
-{
-	struct net_device *lower_dev, *stop_dev;
-	struct list_head *iter;
-	int err;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_construct)
-			continue;
-		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
-		if (err) {
-			stop_dev = lower_dev;
-			goto rollback;
-		}
-	}
-	return 0;
-
-rollback:
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (lower_dev == stop_dev)
-			break;
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-	return err;
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
-
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
-					  struct neighbour *n)
-{
-	struct net_device *lower_dev;
-	struct list_head *iter;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
-
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-- 
cgit v1.2.3


From d15c9ede6123dbce14c17eb9ced229e488002735 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 3 Feb 2017 17:37:06 +0800
Subject: sctp: process fwd tsn chunk only when prsctp is enabled

This patch is to check if asoc->peer.prsctp_capable is set before
processing fwd tsn chunk, if not, it will return an ERROR to the
peer, just as rfc3758 section 3.3.1 demands.

Reported-by: Julian Cordes <julian.cordes@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 782e579472c9..d8798ddda726 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3867,6 +3867,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
+	if (!asoc->peer.prsctp_capable)
+		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
 	/* Make sure that the FORWARD_TSN chunk has valid length.  */
 	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3935,6 +3938,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
+	if (!asoc->peer.prsctp_capable)
+		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
 	/* Make sure that the FORWARD_TSN chunk has a valid length.  */
 	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
-- 
cgit v1.2.3


From d0d7b10b05945f40fefd4e60f457c61aefa3e9a9 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sat, 4 Feb 2017 11:00:49 -0600
Subject: net-next: treewide use is_vlan_dev() helper function.

This patch makes use of is_vlan_dev() function instead of flag
comparison which is exactly done by is_vlan_dev() helper function.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Jon Maxwell <jmaxwell37@gmail.com>
Acked-by: Johannes Thumshirn <jth@kernel.org>
Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/core/cma.c                        |  6 ++----
 drivers/infiniband/sw/rxe/rxe_net.c                  |  2 +-
 drivers/net/ethernet/broadcom/cnic.c                 |  2 +-
 drivers/net/ethernet/chelsio/cxgb3/l2t.c             |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c      |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4/l2t.c             |  2 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  8 ++++----
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c     |  4 ++--
 drivers/net/hyperv/netvsc_drv.c                      |  2 +-
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c                    |  6 +++---
 drivers/scsi/cxgbi/libcxgbi.c                        |  6 +++---
 drivers/scsi/fcoe/fcoe.c                             | 13 ++++++-------
 include/rdma/ib_addr.h                               |  6 ++----
 net/hsr/hsr_slave.c                                  |  3 ++-
 14 files changed, 31 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 3e70a9c5d79d..4eb5a80e5d81 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2467,14 +2467,12 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos)
 	struct net_device *dev;
 
 	prio = rt_tos2priority(tos);
-	dev = ndev->priv_flags & IFF_802_1Q_VLAN ?
-		vlan_dev_real_dev(ndev) : ndev;
-
+	dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev;
 	if (dev->num_tc)
 		return netdev_get_prio_tc_map(dev, prio);
 
 #if IS_ENABLED(CONFIG_VLAN_8021Q)
-	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(ndev))
 		return (vlan_dev_get_egress_qos_mask(ndev, prio) &
 			VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
 #endif
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 4abdeb359fb4..d9d15561eb5d 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -118,7 +118,7 @@ static struct device *dma_device(struct rxe_dev *rxe)
 
 	ndev = rxe->ndev;
 
-	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(ndev))
 		ndev = vlan_dev_real_dev(ndev);
 
 	return ndev->dev.parent;
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index b1d2ac818710..cec94bbb2ea5 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -3665,7 +3665,7 @@ static int cnic_cm_destroy(struct cnic_sock *csk)
 static inline u16 cnic_get_vlan(struct net_device *dev,
 				struct net_device **vlan_dev)
 {
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		*vlan_dev = vlan_dev_real_dev(dev);
 		return vlan_dev_vlan_id(dev);
 	}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
index 5f226eda8cd6..52063587e1e9 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
@@ -351,7 +351,7 @@ struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
 		e->smt_idx = smt_idx;
 		atomic_set(&e->refcnt, 1);
 		neigh_replace(e, neigh);
-		if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
+		if (is_vlan_dev(neigh->dev))
 			e->vlan = vlan_dev_vlan_id(neigh->dev);
 		else
 			e->vlan = VLAN_NONE;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index f4f569060689..70590144be03 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1805,7 +1805,7 @@ static void check_neigh_update(struct neighbour *neigh)
 	const struct device *parent;
 	const struct net_device *netdev = neigh->dev;
 
-	if (netdev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(netdev))
 		netdev = vlan_dev_real_dev(netdev);
 	parent = netdev->dev.parent;
 	if (parent && parent->driver == &cxgb4_driver.driver)
@@ -2111,7 +2111,7 @@ static int cxgb4_inet6addr_handler(struct notifier_block *this,
 #if IS_ENABLED(CONFIG_BONDING)
 	struct adapter *adap;
 #endif
-	if (event_dev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(event_dev))
 		event_dev = vlan_dev_real_dev(event_dev);
 #if IS_ENABLED(CONFIG_BONDING)
 	if (event_dev->flags & IFF_MASTER) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
index 60a26037a1c6..7c8c5b9a3c22 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -432,7 +432,7 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh,
 	else
 		lport = netdev2pinfo(physdev)->lport;
 
-	if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(neigh->dev))
 		vlan = vlan_dev_vlan_id(neigh->dev);
 	else
 		vlan = VLAN_NONE;
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 0cf8a3703275..3b5d7cfa2321 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -3264,7 +3264,7 @@ netxen_list_config_ip(struct netxen_adapter *adapter,
 		cur = kzalloc(sizeof(struct nx_ip_list), GFP_ATOMIC);
 		if (cur == NULL)
 			goto out;
-		if (dev->priv_flags & IFF_802_1Q_VLAN)
+		if (is_vlan_dev(dev))
 			dev = vlan_dev_real_dev(dev);
 		cur->master = !!netif_is_bond_master(dev);
 		cur->ip_addr = ifa->ifa_address;
@@ -3374,7 +3374,7 @@ static void netxen_config_master(struct net_device *dev, unsigned long event)
 	    !netif_is_bond_slave(dev)) {
 		netxen_config_indev_addr(adapter, master, event);
 		for_each_netdev_rcu(&init_net, slave)
-			if (slave->priv_flags & IFF_802_1Q_VLAN &&
+			if (is_vlan_dev(slave) &&
 			    vlan_dev_real_dev(slave) == master)
 				netxen_config_indev_addr(adapter, slave, event);
 	}
@@ -3400,7 +3400,7 @@ recheck:
 	if (dev == NULL)
 		goto done;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		dev = vlan_dev_real_dev(dev);
 		goto recheck;
 	}
@@ -3445,7 +3445,7 @@ recheck:
 	if (dev == NULL)
 		goto done;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		dev = vlan_dev_real_dev(dev);
 		goto recheck;
 	}
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index 4c0cce962585..b6628aaa6e4a 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -4220,7 +4220,7 @@ recheck:
 	if (dev == NULL)
 		goto done;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		dev = vlan_dev_real_dev(dev);
 		goto recheck;
 	}
@@ -4256,7 +4256,7 @@ recheck:
 	if (dev == NULL)
 		goto done;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		dev = vlan_dev_real_dev(dev);
 		goto recheck;
 	}
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 72b0c1f7496e..425285f36595 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1605,7 +1605,7 @@ static int netvsc_netdev_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	/* Avoid Vlan dev with same MAC registering as VF */
-	if (event_dev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(event_dev))
 		return NOTIFY_DONE;
 
 	/* Avoid Bonding master dev with same MAC registering as VF */
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index c639d5a02656..4b9e7761a97b 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -2282,7 +2282,7 @@ static int _bnx2fc_create(struct net_device *netdev,
 	}
 
 	/* obtain physical netdev */
-	if (netdev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(netdev))
 		phys_dev = vlan_dev_real_dev(netdev);
 
 	/* verify if the physical device is a netxtreme2 device */
@@ -2320,7 +2320,7 @@ static int _bnx2fc_create(struct net_device *netdev,
 		goto ifput_err;
 	}
 
-	if (netdev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(netdev)) {
 		vlan_id = vlan_dev_vlan_id(netdev);
 		interface->vlan_enabled = 1;
 	}
@@ -2538,7 +2538,7 @@ static bool bnx2fc_match(struct net_device *netdev)
 	struct net_device *phys_dev = netdev;
 
 	mutex_lock(&bnx2fc_dev_lock);
-	if (netdev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(netdev))
 		phys_dev = vlan_dev_real_dev(netdev);
 
 	if (bnx2fc_hba_lookup(phys_dev)) {
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 9167bcd9fffe..bd7d39ecbd24 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -223,7 +223,7 @@ struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *ndev,
 	struct cxgbi_device *cdev, *tmp;
 	int i;
 
-	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(ndev)) {
 		vdev = ndev;
 		ndev = vlan_dev_real_dev(ndev);
 		log_debug(1 << CXGBI_DBG_DEV,
@@ -256,7 +256,7 @@ struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *ndev,
 	struct cxgbi_device *cdev;
 	int i;
 
-	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(ndev)) {
 		vdev = ndev;
 		ndev = vlan_dev_real_dev(ndev);
 		pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name);
@@ -290,7 +290,7 @@ static struct cxgbi_device *cxgbi_device_find_by_mac(struct net_device *ndev,
 	struct cxgbi_device *cdev, *tmp;
 	int i;
 
-	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(ndev)) {
 		vdev = ndev;
 		ndev = vlan_dev_real_dev(ndev);
 		pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name);
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 59150cad0353..79160ffae483 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -326,8 +326,7 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
 
 	/* look for SAN MAC address, if multiple SAN MACs exist, only
 	 * use the first one for SPMA */
-	real_dev = (netdev->priv_flags & IFF_802_1Q_VLAN) ?
-		vlan_dev_real_dev(netdev) : netdev;
+	real_dev = is_vlan_dev(netdev) ? vlan_dev_real_dev(netdev) : netdev;
 	fcoe->realdev = real_dev;
 	rcu_read_lock();
 	for_each_dev_addr(real_dev, ha) {
@@ -730,7 +729,7 @@ static int fcoe_netdev_config(struct fc_lport *lport, struct net_device *netdev)
 	ctlr = fcoe_to_ctlr(fcoe);
 
 	/* Figure out the VLAN ID, if any */
-	if (netdev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(netdev))
 		lport->vlan = vlan_dev_vlan_id(netdev);
 	else
 		lport->vlan = 0;
@@ -959,13 +958,13 @@ static inline int fcoe_em_config(struct fc_lport *lport)
 	 * Reuse existing offload em instance in case
 	 * it is already allocated on real eth device
 	 */
-	if (fcoe->netdev->priv_flags & IFF_802_1Q_VLAN)
+	if (is_vlan_dev(fcoe->netdev))
 		cur_real_dev = vlan_dev_real_dev(fcoe->netdev);
 	else
 		cur_real_dev = fcoe->netdev;
 
 	list_for_each_entry(oldfcoe, &fcoe_hostlist, list) {
-		if (oldfcoe->netdev->priv_flags & IFF_802_1Q_VLAN)
+		if (is_vlan_dev(oldfcoe->netdev))
 			old_real_dev = vlan_dev_real_dev(oldfcoe->netdev);
 		else
 			old_real_dev = oldfcoe->netdev;
@@ -1563,7 +1562,7 @@ static int fcoe_xmit(struct fc_lport *lport, struct fc_frame *fp)
 	skb->protocol = htons(ETH_P_FCOE);
 	skb->priority = fcoe->priority;
 
-	if (fcoe->netdev->priv_flags & IFF_802_1Q_VLAN &&
+	if (is_vlan_dev(fcoe->netdev) &&
 	    fcoe->realdev->features & NETIF_F_HW_VLAN_CTAG_TX) {
 		/* must set skb->dev before calling vlan_put_tag */
 		skb->dev = fcoe->realdev;
@@ -1794,7 +1793,7 @@ fcoe_hostlist_lookup_realdev_port(struct net_device *netdev)
 	struct net_device *real_dev;
 
 	list_for_each_entry(fcoe, &fcoe_hostlist, list) {
-		if (fcoe->netdev->priv_flags & IFF_802_1Q_VLAN)
+		if (is_vlan_dev(fcoe->netdev))
 			real_dev = vlan_dev_real_dev(fcoe->netdev);
 		else
 			real_dev = fcoe->netdev;
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 1beab5532035..4b34c51f859e 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -160,8 +160,7 @@ static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr)
 
 static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
 {
-	return dev->priv_flags & IFF_802_1Q_VLAN ?
-		vlan_dev_vlan_id(dev) : 0xffff;
+	return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff;
 }
 
 static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
@@ -326,8 +325,7 @@ static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
 
 static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
 {
-	return dev->priv_flags & IFF_802_1Q_VLAN ?
-		vlan_dev_real_dev(dev) : NULL;
+	return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL;
 }
 
 #endif /* IB_ADDR_H */
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f5b60388d02f..56080da4aa77 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -12,6 +12,7 @@
 #include "hsr_slave.h"
 #include <linux/etherdevice.h>
 #include <linux/if_arp.h>
+#include <linux/if_vlan.h>
 #include "hsr_main.h"
 #include "hsr_device.h"
 #include "hsr_forward.h"
@@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev)
 		return -EINVAL;
 	}
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 88e4f0ca4e4e7760e4aad544789c5408219886d5 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:16 -0500
Subject: net: dsa: move netdevice notifier registration

Move the netdevice notifier block register code in slave.c and provide
helpers for dsa.c to register and unregister it.

At the same time, check for errors since (un)register_netdevice_notifier
may fail.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c      | 10 ++++------
 net/dsa/dsa_priv.h |  4 ++--
 net/dsa/slave.c    | 22 ++++++++++++++++++++--
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 619e57a44d1d..beb79ccf0f59 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -903,10 +903,6 @@ static struct packet_type dsa_pack_type __read_mostly = {
 	.func	= dsa_switch_rcv,
 };
 
-static struct notifier_block dsa_netdevice_nb __read_mostly = {
-	.notifier_call	= dsa_slave_netdevice_event,
-};
-
 #ifdef CONFIG_PM_SLEEP
 static int dsa_suspend(struct device *d)
 {
@@ -964,7 +960,9 @@ static int __init dsa_init_module(void)
 {
 	int rc;
 
-	register_netdevice_notifier(&dsa_netdevice_nb);
+	rc = dsa_slave_register_notifier();
+	if (rc)
+		return rc;
 
 	rc = platform_driver_register(&dsa_driver);
 	if (rc)
@@ -978,7 +976,7 @@ module_init(dsa_init_module);
 
 static void __exit dsa_cleanup_module(void)
 {
-	unregister_netdevice_notifier(&dsa_netdevice_nb);
+	dsa_slave_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 	platform_driver_unregister(&dsa_driver);
 }
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index a5509b765fc0..591a40aea9ca 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -63,8 +63,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
 int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_netdevice_event(struct notifier_block *unused,
-			      unsigned long event, void *ptr);
+int dsa_slave_register_notifier(void);
+void dsa_slave_unregister_notifier(void);
 
 /* tag_dsa.c */
 extern const struct dsa_device_ops dsa_netdev_ops;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 09fc3e9462c1..949644c1dac2 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1524,8 +1524,8 @@ static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
 	return NOTIFY_DONE;
 }
 
-int dsa_slave_netdevice_event(struct notifier_block *unused,
-			      unsigned long event, void *ptr)
+static int dsa_slave_netdevice_event(struct notifier_block *nb,
+				     unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
@@ -1534,3 +1534,21 @@ int dsa_slave_netdevice_event(struct notifier_block *unused,
 
 	return NOTIFY_DONE;
 }
+
+static struct notifier_block dsa_slave_nb __read_mostly = {
+	.notifier_call	= dsa_slave_netdevice_event,
+};
+
+int dsa_slave_register_notifier(void)
+{
+	return register_netdevice_notifier(&dsa_slave_nb);
+}
+
+void dsa_slave_unregister_notifier(void)
+{
+	int err;
+
+	err = unregister_netdevice_notifier(&dsa_slave_nb);
+	if (err)
+		pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
+}
-- 
cgit v1.2.3


From 8e92ab3a426e04dc355b196e3b4474f633025a3b Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:17 -0500
Subject: net: dsa: simplify netdevice events handling

Simplify the code handling the slave netdevice notifier call by
providing a dsa_slave_changeupper helper for NETDEV_CHANGEUPPER, and so
on (only this event is supported at the moment.)

Return NOTIFY_DONE when we did not care about an event, and NOTIFY_OK
when we were concerned but no error occurred, as the API suggests.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 949644c1dac2..332eb234dc21 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1491,37 +1491,22 @@ static bool dsa_slave_dev_check(struct net_device *dev)
 	return dev->netdev_ops == &dsa_slave_netdev_ops;
 }
 
-static int dsa_slave_port_upper_event(struct net_device *dev,
-				      unsigned long event, void *ptr)
+static int dsa_slave_changeupper(struct net_device *dev,
+				 struct netdev_notifier_changeupper_info *info)
 {
-	struct netdev_notifier_changeupper_info *info = ptr;
-	struct net_device *upper = info->upper_dev;
-	int err = 0;
+	int err = NOTIFY_DONE;
 
-	switch (event) {
-	case NETDEV_CHANGEUPPER:
-		if (netif_is_bridge_master(upper)) {
-			if (info->linking)
-				err = dsa_slave_bridge_port_join(dev, upper);
-			else
-				dsa_slave_bridge_port_leave(dev, upper);
+	if (netif_is_bridge_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_slave_bridge_port_join(dev, info->upper_dev);
+			err = notifier_from_errno(err);
+		} else {
+			dsa_slave_bridge_port_leave(dev, info->upper_dev);
+			err = NOTIFY_OK;
 		}
-
-		break;
-	}
-
-	return notifier_from_errno(err);
-}
-
-static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
-				void *ptr)
-{
-	switch (event) {
-	case NETDEV_CHANGEUPPER:
-		return dsa_slave_port_upper_event(dev, event, ptr);
 	}
 
-	return NOTIFY_DONE;
+	return err;
 }
 
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
@@ -1529,8 +1514,11 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
-	if (dsa_slave_dev_check(dev))
-		return dsa_slave_port_event(dev, event, ptr);
+	if (dev->netdev_ops != &dsa_slave_netdev_ops)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_CHANGEUPPER)
+		return dsa_slave_changeupper(dev, ptr);
 
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.3


From 9c26542685130ef3b55cdb4e04eec0ac33376b41 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:18 -0500
Subject: net: dsa: rollback bridging on error

When an error is returned during the bridging of a port in a
NETDEV_CHANGEUPPER event, net/core/dev.c rolls back the operation.

Be consistent and unassign dp->bridge_dev when this happens.

In the meantime, add comments to document this behavior.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 332eb234dc21..d726307c7795 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -562,12 +562,21 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
+	/* Here the port is already bridged. Reflect the current configuration
+	 * so that drivers can program their chips accordingly.
+	 */
 	p->dp->bridge_dev = br;
 
 	if (ds->ops->port_bridge_join)
 		ret = ds->ops->port_bridge_join(ds, p->dp->index, br);
 
-	return ret == -EOPNOTSUPP ? 0 : ret;
+	/* The bridging is rolled back on error */
+	if (ret && ret != -EOPNOTSUPP) {
+		p->dp->bridge_dev = NULL;
+		return ret;
+	}
+
+	return 0;
 }
 
 static void dsa_slave_bridge_port_leave(struct net_device *dev,
@@ -576,6 +585,9 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->dp->ds;
 
+	/* Here the port is already unbridged. Reflect the current configuration
+	 * so that drivers can program their chips accordingly.
+	 */
 	p->dp->bridge_dev = NULL;
 
 	if (ds->ops->port_bridge_leave)
-- 
cgit v1.2.3


From c5d35cb32cffa6e4c2db1cbd9a544e10a8d6fda9 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:19 -0500
Subject: net: dsa: change state setter scope

The scope of the functions inside net/dsa/slave.c must be the slave
net_device pointer. Change to state setter helper accordingly to
simplify callers.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d726307c7795..d8c3c0f00cf3 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -74,9 +74,12 @@ static inline bool dsa_port_is_bridged(struct dsa_port *dp)
 	return !!dp->bridge_dev;
 }
 
-static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+static void dsa_slave_set_state(struct net_device *dev, u8 state)
 {
-	struct dsa_port *dp = &ds->ports[port];
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_port *dp = p->dp;
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
 
 	if (ds->ops->port_stp_state_set)
 		ds->ops->port_stp_state_set(ds, port, state);
@@ -133,7 +136,7 @@ static int dsa_slave_open(struct net_device *dev)
 			goto clear_promisc;
 	}
 
-	dsa_port_set_stp_state(ds, p->dp->index, stp_state);
+	dsa_slave_set_state(dev, stp_state);
 
 	if (p->phy)
 		phy_start(p->phy);
@@ -175,7 +178,7 @@ static int dsa_slave_close(struct net_device *dev)
 	if (ds->ops->port_disable)
 		ds->ops->port_disable(ds, p->dp->index, p->phy);
 
-	dsa_port_set_stp_state(ds, p->dp->index, BR_STATE_DISABLED);
+	dsa_slave_set_state(dev, BR_STATE_DISABLED);
 
 	return 0;
 }
@@ -382,7 +385,7 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
 	if (switchdev_trans_ph_prepare(trans))
 		return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
 
-	dsa_port_set_stp_state(ds, p->dp->index, attr->u.stp_state);
+	dsa_slave_set_state(dev, attr->u.stp_state);
 
 	return 0;
 }
@@ -596,7 +599,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev,
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
-	dsa_port_set_stp_state(ds, p->dp->index, BR_STATE_FORWARDING);
+	dsa_slave_set_state(dev, BR_STATE_FORWARDING);
 }
 
 static int dsa_slave_port_attr_get(struct net_device *dev,
-- 
cgit v1.2.3


From f515f192ab4f45bb695146b82432d63d98775787 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:20 -0500
Subject: net: dsa: add switch notifier

Add a notifier block per DSA switch, registered against a notifier head
in the switch fabric they belong to.

This infrastructure will allow to propagate fabric-wide events such as
port bridging, VLAN configuration, etc. If a DSA switch driver cares
about cross-chip configuration, such events can be caught.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  7 +++++++
 net/dsa/Makefile   |  1 +
 net/dsa/dsa.c      |  6 ++++++
 net/dsa/dsa2.c     |  6 ++++++
 net/dsa/dsa_priv.h |  4 ++++
 net/dsa/switch.c   | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 77 insertions(+)
 create mode 100644 net/dsa/switch.c

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2cb77e64d648..ac4ea7c3a102 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -13,6 +13,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/list.h>
+#include <linux/notifier.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/of.h>
@@ -92,6 +93,9 @@ struct packet_type;
 struct dsa_switch_tree {
 	struct list_head	list;
 
+	/* Notifier chain for switch-wide events */
+	struct raw_notifier_head	nh;
+
 	/* Tree identifier */
 	u32 tree;
 
@@ -182,6 +186,9 @@ struct dsa_switch {
 	struct dsa_switch_tree	*dst;
 	int			index;
 
+	/* Listener for switch fabric events */
+	struct notifier_block	nb;
+
 	/*
 	 * Give the switch driver somewhere to hang its private data
 	 * structure.
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..72912982de3d 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,7 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += dsa.o slave.o dsa2.o
+dsa_core-y += dsa.o slave.o dsa2.o switch.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index beb79ccf0f59..22e44f691ab9 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -275,6 +275,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	if (ret < 0)
 		return ret;
 
+	ret = dsa_switch_register_notifier(ds);
+	if (ret)
+		return ret;
+
 	if (ops->set_addr) {
 		ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
 		if (ret < 0)
@@ -400,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 
 	if (ds->slave_mii_bus && ds->ops->phy_read)
 		mdiobus_unregister(ds->slave_mii_bus);
+
+	dsa_switch_unregister_notifier(ds);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 9f8cc26be9ea..1c546b6621ee 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -294,6 +294,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	if (err < 0)
 		return err;
 
+	err = dsa_switch_register_notifier(ds);
+	if (err)
+		return err;
+
 	if (ds->ops->set_addr) {
 		err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
 		if (err < 0)
@@ -364,6 +368,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 
 	if (ds->slave_mii_bus && ds->ops->phy_read)
 		mdiobus_unregister(ds->slave_mii_bus);
+
+	dsa_switch_unregister_notifier(ds);
 }
 
 static int dsa_dst_apply(struct dsa_switch_tree *dst)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 591a40aea9ca..0706a511244e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -66,6 +66,10 @@ int dsa_slave_resume(struct net_device *slave_dev);
 int dsa_slave_register_notifier(void);
 void dsa_slave_unregister_notifier(void);
 
+/* switch.c */
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
 /* tag_dsa.c */
 extern const struct dsa_device_ops dsa_netdev_ops;
 
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..e22fa7633d03
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,53 @@
+/*
+ * Handling of a single switch chip, part of a switch fabric
+ *
+ * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <net/dsa.h>
+
+static int dsa_switch_event(struct notifier_block *nb,
+			    unsigned long event, void *info)
+{
+	struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
+	int err;
+
+	switch (event) {
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	/* Non-switchdev operations cannot be rolled back. If a DSA driver
+	 * returns an error during the chained call, switch chips may be in an
+	 * inconsistent state.
+	 */
+	if (err)
+		dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
+			event, err);
+
+	return notifier_from_errno(err);
+}
+
+int dsa_switch_register_notifier(struct dsa_switch *ds)
+{
+	ds->nb.notifier_call = dsa_switch_event;
+
+	return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
+}
+
+void dsa_switch_unregister_notifier(struct dsa_switch *ds)
+{
+	int err;
+
+	err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
+	if (err)
+		dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
+}
-- 
cgit v1.2.3


From 04d3a4c6af52a58370795bc9f70dc15f51f8bb84 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Fri, 3 Feb 2017 13:20:21 -0500
Subject: net: dsa: introduce bridge notifier

A slave device will now notify the switch fabric once its port is
bridged or unbridged, instead of calling directly its switch operations.

This code allows propagating cross-chip bridging events in the fabric.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 10 ++++++++++
 net/dsa/slave.c   | 40 +++++++++++++++++++++++++++++-----------
 net/dsa/switch.c  | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ac4ea7c3a102..e9c940c8936f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -268,6 +268,16 @@ struct switchdev_obj_port_fdb;
 struct switchdev_obj_port_mdb;
 struct switchdev_obj_port_vlan;
 
+#define DSA_NOTIFIER_BRIDGE_JOIN		1
+#define DSA_NOTIFIER_BRIDGE_LEAVE		2
+
+/* DSA_NOTIFIER_BRIDGE_* */
+struct dsa_notifier_bridge_info {
+	struct net_device *br;
+	int sw_index;
+	int port;
+};
+
 struct dsa_switch_ops {
 	/*
 	 * Probing and setup.
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d8c3c0f00cf3..061a49c29cef 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -27,6 +27,17 @@
 
 static bool dsa_slave_dev_check(struct net_device *dev);
 
+static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct raw_notifier_head *nh = &p->dp->ds->dst->nh;
+	int err;
+
+	err = raw_notifier_call_chain(nh, e, v);
+
+	return notifier_to_errno(err);
+}
+
 /* slave mii_bus handling ***************************************************/
 static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
 {
@@ -562,39 +573,46 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
-	int ret = -EOPNOTSUPP;
+	struct dsa_notifier_bridge_info info = {
+		.sw_index = p->dp->ds->index,
+		.port = p->dp->index,
+		.br = br,
+	};
+	int err;
 
 	/* Here the port is already bridged. Reflect the current configuration
 	 * so that drivers can program their chips accordingly.
 	 */
 	p->dp->bridge_dev = br;
 
-	if (ds->ops->port_bridge_join)
-		ret = ds->ops->port_bridge_join(ds, p->dp->index, br);
+	err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info);
 
 	/* The bridging is rolled back on error */
-	if (ret && ret != -EOPNOTSUPP) {
+	if (err)
 		p->dp->bridge_dev = NULL;
-		return ret;
-	}
 
-	return 0;
+	return err;
 }
 
 static void dsa_slave_bridge_port_leave(struct net_device *dev,
 					struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
+	struct dsa_notifier_bridge_info info = {
+		.sw_index = p->dp->ds->index,
+		.port = p->dp->index,
+		.br = br,
+	};
+	int err;
 
 	/* Here the port is already unbridged. Reflect the current configuration
 	 * so that drivers can program their chips accordingly.
 	 */
 	p->dp->bridge_dev = NULL;
 
-	if (ds->ops->port_bridge_leave)
-		ds->ops->port_bridge_leave(ds, p->dp->index, br);
+	err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+	if (err)
+		netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
 
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index e22fa7633d03..6456dacf9ae9 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -13,6 +13,32 @@
 #include <linux/notifier.h>
 #include <net/dsa.h>
 
+static int dsa_switch_bridge_join(struct dsa_switch *ds,
+				  struct dsa_notifier_bridge_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_bridge_join)
+		return ds->ops->port_bridge_join(ds, info->port, info->br);
+
+	if (ds->index != info->sw_index)
+		dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
+			info->sw_index, info->port, netdev_name(info->br));
+
+	return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+				   struct dsa_notifier_bridge_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
+		ds->ops->port_bridge_leave(ds, info->port, info->br);
+
+	if (ds->index != info->sw_index)
+		dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
+			info->sw_index, info->port, netdev_name(info->br));
+
+	return 0;
+}
+
 static int dsa_switch_event(struct notifier_block *nb,
 			    unsigned long event, void *info)
 {
@@ -20,6 +46,12 @@ static int dsa_switch_event(struct notifier_block *nb,
 	int err;
 
 	switch (event) {
+	case DSA_NOTIFIER_BRIDGE_JOIN:
+		err = dsa_switch_bridge_join(ds, info);
+		break;
+	case DSA_NOTIFIER_BRIDGE_LEAVE:
+		err = dsa_switch_bridge_leave(ds, info);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
-- 
cgit v1.2.3


From 1f90c7f3470580e24da25f6a6c1fb480ed9371ac Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Sat, 4 Feb 2017 18:05:06 +0100
Subject: bridge: modify bridge and port to have often accessed fields in one
 cache line

Move around net_bridge so the vlan fields are in the beginning since
they're checked on every packet even if vlan filtering is disabled.
For the port move flags & vlan group to the beginning, so they're in the
same cache line with the port's state (both flags and state are checked
on each packet).

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_private.h | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 40177df45ba6..ec8560349b6f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -212,12 +212,16 @@ struct net_bridge_mdb_htable
 	u32				ver;
 };
 
-struct net_bridge_port
-{
+struct net_bridge_port {
 	struct net_bridge		*br;
 	struct net_device		*dev;
 	struct list_head		list;
 
+	unsigned long			flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	/* STP */
 	u8				priority;
 	u8				state;
@@ -238,8 +242,6 @@ struct net_bridge_port
 	struct kobject			kobj;
 	struct rcu_head			rcu;
 
-	unsigned long 			flags;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query	ip4_own_query;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -259,9 +261,6 @@ struct net_bridge_port
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll			*np;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	int				offload_fwd_mark;
 #endif
@@ -283,14 +282,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
-struct net_bridge
-{
+struct net_bridge {
 	spinlock_t			lock;
+	spinlock_t			hash_lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
-
 	struct pcpu_sw_netstats		__percpu *stats;
-	spinlock_t			hash_lock;
+	/* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u8				vlan_enabled;
+	u8				vlan_stats_enabled;
+	__be16				vlan_proto;
+	u16				default_pvid;
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
@@ -308,6 +314,9 @@ struct net_bridge
 	bridge_id			designated_root;
 	bridge_id			bridge_id;
 	u32				root_path_cost;
+	unsigned char			topology_change;
+	unsigned char			topology_change_detected;
+	u16				root_port;
 	unsigned long			max_age;
 	unsigned long			hello_time;
 	unsigned long			forward_delay;
@@ -319,7 +328,6 @@ struct net_bridge
 
 	u8				group_addr[ETH_ALEN];
 	bool				group_addr_set;
-	u16				root_port;
 
 	enum {
 		BR_NO_STP, 		/* no spanning tree */
@@ -327,9 +335,6 @@ struct net_bridge
 		BR_USER_STP,		/* new RSTP in userspace */
 	} stp_enabled;
 
-	unsigned char			topology_change;
-	unsigned char			topology_change_detected;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	unsigned char			multicast_router;
 
@@ -381,14 +386,6 @@ struct net_bridge
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
-
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-	u8				vlan_enabled;
-	u8				vlan_stats_enabled;
-	__be16				vlan_proto;
-	u16				default_pvid;
-#endif
 };
 
 struct br_input_skb_cb {
-- 
cgit v1.2.3


From f7cdee8a79a1cb03fa9ca71b825e72f880b344e1 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Sat, 4 Feb 2017 18:05:07 +0100
Subject: bridge: move to workqueue gc

Move the fdb garbage collector to a workqueue which fires at least 10
milliseconds apart and cleans chain by chain allowing for other tasks
to run in the meantime. When having thousands of fdbs the system is much
more responsive. Most importantly remove the need to check if the
matched entry has expired in __br_fdb_get that causes false-sharing and
is completely unnecessary if we cleanup entries, at worst we'll get 10ms
of traffic for that entry before it gets deleted.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c    |  1 +
 net/bridge/br_fdb.c       | 31 +++++++++++++++++++------------
 net/bridge/br_if.c        |  2 +-
 net/bridge/br_ioctl.c     |  2 +-
 net/bridge/br_netlink.c   |  2 +-
 net/bridge/br_private.h   |  4 ++--
 net/bridge/br_stp.c       |  2 +-
 net/bridge/br_stp_if.c    |  4 ++--
 net/bridge/br_stp_timer.c |  2 --
 net/bridge/br_sysfs_br.c  |  2 +-
 10 files changed, 29 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 5ba0b558f8ae..d208ee9ab60a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -411,4 +411,5 @@ void br_dev_setup(struct net_device *dev)
 	br_netfilter_rtable_init(br);
 	br_stp_timer_init(br);
 	br_multicast_init(br);
+	INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e4a4176171c9..5cbed5c0db88 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -154,7 +154,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	if (f->added_by_external_learn)
 		fdb_del_external_learn(f);
 
-	hlist_del_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->hlist);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -290,34 +290,43 @@ out:
 	spin_unlock_bh(&br->hash_lock);
 }
 
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_cleanup(struct work_struct *work)
 {
-	struct net_bridge *br = (struct net_bridge *)_data;
+	struct net_bridge *br = container_of(work, struct net_bridge,
+					     gc_work.work);
 	unsigned long delay = hold_time(br);
-	unsigned long next_timer = jiffies + br->ageing_time;
+	unsigned long work_delay = delay;
+	unsigned long now = jiffies;
 	int i;
 
-	spin_lock(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		struct net_bridge_fdb_entry *f;
 		struct hlist_node *n;
 
+		if (!br->hash[i].first)
+			continue;
+
+		spin_lock_bh(&br->hash_lock);
 		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
 			unsigned long this_timer;
+
 			if (f->is_static)
 				continue;
 			if (f->added_by_external_learn)
 				continue;
 			this_timer = f->updated + delay;
-			if (time_before_eq(this_timer, jiffies))
+			if (time_after(this_timer, now))
+				work_delay = min(work_delay, this_timer - now);
+			else
 				fdb_delete(br, f);
-			else if (time_before(this_timer, next_timer))
-				next_timer = this_timer;
 		}
+		spin_unlock_bh(&br->hash_lock);
+		cond_resched();
 	}
-	spin_unlock(&br->hash_lock);
 
-	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+	/* Cleanup minimum 10 milliseconds apart */
+	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+	mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
 }
 
 /* Completely flush all dynamic entries in forwarding database.*/
@@ -382,8 +391,6 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
 				&br->hash[br_mac_hash(addr, vid)], hlist) {
 		if (ether_addr_equal(fdb->addr.addr, addr) &&
 		    fdb->vlan_id == vid) {
-			if (unlikely(has_expired(br, fdb)))
-				break;
 			return fdb;
 		}
 	}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..8ac1770aa222 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 
 	br_vlan_flush(br);
 	br_multicast_dev_del(br);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 
 	br_sysfs_delbr(br->dev);
 	unregister_netdevice_queue(br->dev, head);
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index da8157c57eb1..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		b.hello_timer_value = br_timer_value(&br->hello_timer);
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
 		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
-		b.gc_timer_value = br_timer_value(&br->gc_timer);
+		b.gc_timer_value = br_timer_value(&br->gc_work.timer);
 		rcu_read_unlock();
 
 		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fc5d885dbb22..1cbdc5b96aa7 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1250,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
 			      IFLA_BR_PAD))
 		return -EMSGSIZE;
-	clockval = br_timer_value(&br->gc_timer);
+	clockval = br_timer_value(&br->gc_work.timer);
 	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
 		return -EMSGSIZE;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index ec8560349b6f..47fd64bf5022 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -379,7 +379,7 @@ struct net_bridge {
 	struct timer_list		hello_timer;
 	struct timer_list		tcn_timer;
 	struct timer_list		topology_change_timer;
-	struct timer_list		gc_timer;
+	struct delayed_work		gc_work;
 	struct kobject			*ifobj;
 	u32				auto_cnt;
 
@@ -502,7 +502,7 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const unsigned char *addr, u16 vid);
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
 void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
 struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 71fd1a4e63cc..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	br->ageing_time = t;
 	spin_unlock_bh(&br->lock);
 
-	mod_timer(&br->gc_timer, jiffies);
+	mod_delayed_work(system_long_wq, &br->gc_work, 0);
 
 	return 0;
 }
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 6c1e21411125..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	spin_lock_bh(&br->lock);
 	if (br->stp_enabled == BR_KERNEL_STP)
 		mod_timer(&br->hello_timer, jiffies + br->hello_time);
-	mod_timer(&br->gc_timer, jiffies + HZ/10);
+	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
 
 	br_config_bpdu_generation(br);
 
@@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 	del_timer_sync(&br->hello_timer);
 	del_timer_sync(&br->topology_change_timer);
 	del_timer_sync(&br->tcn_timer);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 }
 
 /* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7ddb38e0a06e..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
 	setup_timer(&br->topology_change_timer,
 		      br_topology_change_timer_expired,
 		      (unsigned long) br);
-
-	setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
 }
 
 void br_stp_port_timer_init(struct net_bridge_port *p)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index a18148213b08..0f4034934d56 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
 			     char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
 }
 static DEVICE_ATTR_RO(gc_timer);
 
-- 
cgit v1.2.3


From 1214628cb1868254e107230c9052f28ff9899b6a Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Sat, 4 Feb 2017 18:05:08 +0100
Subject: bridge: move write-heavy fdb members in their own cache line

Fdb's used and updated fields are written to on every packet forward and
packet receive respectively. Thus if we are receiving packets from a
particular fdb, they'll cause false-sharing with everyone who has looked
it up (even if it didn't match, since mac/vid share cache line!). The
"used" field is even worse since it is updated on every packet forward
to that fdb, thus the standard config where X ports use a single gateway
results in 100% fdb false-sharing. Note that this patch does not prevent
the last scenario, but it makes it better for other bridge participants
which are not using that fdb (and are only doing lookups over it).
The point is with this move we make sure that only communicating parties
get the false-sharing, in a later patch we'll show how to avoid that too.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_private.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 47fd64bf5022..1cbbf63a5ef7 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -160,19 +160,21 @@ struct net_bridge_vlan_group {
 	u16				pvid;
 };
 
-struct net_bridge_fdb_entry
-{
+struct net_bridge_fdb_entry {
 	struct hlist_node		hlist;
 	struct net_bridge_port		*dst;
 
-	unsigned long			updated;
-	unsigned long			used;
 	mac_addr			addr;
 	__u16				vlan_id;
 	unsigned char			is_local:1,
 					is_static:1,
 					added_by_user:1,
 					added_by_external_learn:1;
+
+	/* write-heavy members should not affect lookups */
+	unsigned long			updated ____cacheline_aligned_in_smp;
+	unsigned long			used;
+
 	struct rcu_head			rcu;
 };
 
-- 
cgit v1.2.3


From 83a718d6294964fd1b227fa5f1ad001bc1fe7656 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Sat, 4 Feb 2017 18:05:09 +0100
Subject: bridge: fdb: write to used and updated at most once per jiffy

Writing once per jiffy is enough to limit the bridge's false sharing.
After this change the bridge doesn't show up in the local load HitM stats.

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c   | 3 ++-
 net/bridge/br_input.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5cbed5c0db88..5028691fa68a 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -597,7 +597,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				fdb->dst = source;
 				fdb_modified = true;
 			}
-			fdb->updated = jiffies;
+			if (jiffies != fdb->updated)
+				fdb->updated = jiffies;
 			if (unlikely(added_by_user))
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index fba38d8a1a08..220943f920d2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -198,7 +198,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		if (dst->is_local)
 			return br_pass_frame_up(skb);
 
-		dst->used = jiffies;
+		if (jiffies != dst->used)
+			dst->used = jiffies;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
 		if (!mcast_hit)
-- 
cgit v1.2.3


From bddb2afcb6c52a545f18fb9bcd4829828ebf4a3a Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 7 Feb 2017 10:40:50 +0100
Subject: mac80211: add back lost debugfs files

Somehow these files were never present or lost, but the code
is there and they seem somewhat useful, so add them back.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs.c     | 1 +
 net/mac80211/debugfs_sta.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index f62cd0e13c58..47bea0babe78 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -356,6 +356,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
 
 	DEBUGFS_ADD(total_ps_buffered);
 	DEBUGFS_ADD(wep_iv);
+	DEBUGFS_ADD(rate_ctrl_alg);
 	DEBUGFS_ADD(queues);
 	DEBUGFS_ADD(misc);
 #ifdef CONFIG_PM
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index f6003b8c2c33..42601820db20 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -522,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 		return;
 
 	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(aid);
 	DEBUGFS_ADD(num_ps_buf_frames);
 	DEBUGFS_ADD(last_seq_ctrl);
 	DEBUGFS_ADD(agg_status);
-- 
cgit v1.2.3


From 14b89f36eed2993670906a3991bca496a5ebf1a6 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Sat, 4 Feb 2017 13:02:42 -0800
Subject: net: dsa: Rename and export dev_to_net_device()

In preparation for using this function in net/dsa/dsa2.c, rename the function
to make its scope DSA specific, and export it.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 net/dsa/dsa.c     | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e9c940c8936f..2a21fa80f898 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -445,6 +445,7 @@ struct dsa_switch_driver {
 void register_switch_driver(struct dsa_switch_driver *type);
 void unregister_switch_driver(struct dsa_switch_driver *type);
 struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
+struct net_device *dsa_dev_to_net_device(struct device *dev);
 
 static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
 {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 22e44f691ab9..b6d4f6a23f06 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -492,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
 
-static struct net_device *dev_to_net_device(struct device *dev)
+struct net_device *dsa_dev_to_net_device(struct device *dev)
 {
 	struct device *d;
 
@@ -509,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
 
 #ifdef CONFIG_OF
 static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
@@ -817,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev)
 		dev = pd->of_netdev;
 		dev_hold(dev);
 	} else {
-		dev = dev_to_net_device(pd->netdev);
+		dev = dsa_dev_to_net_device(pd->netdev);
 	}
 	if (dev == NULL) {
 		ret = -EPROBE_DEFER;
-- 
cgit v1.2.3


From 71e0bbde0d88047f66b25721f69a441d46083748 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Sat, 4 Feb 2017 13:02:43 -0800
Subject: net: dsa: Add support for platform data

Allow drivers to use the new DSA API with platform data. Most of the
code in net/dsa/dsa2.c does not rely so much on device_nodes and can get
the same information from platform_data instead.

We purposely do not support distributed configurations with platform
data, so drivers should be providing a pointer to a 'struct
dsa_chip_data' structure if they wish to communicate per-port layout.

Multiple CPUs port could potentially be supported and dsa_chip_data is
extended to receive up to one reference to an upstream network device
per port described by a dsa_chip_data structure.

dsa_dev_to_net_device() increments the network device's reference count,
so we intentionally call dev_put() to be consistent with the DT-enabled
path, until we have a generic notifier based solution.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |   6 ++++
 net/dsa/dsa2.c    | 102 ++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2a21fa80f898..b49b2004891e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -45,6 +45,11 @@ struct dsa_chip_data {
 	struct device	*host_dev;
 	int		sw_addr;
 
+	/*
+	 * Reference to network devices
+	 */
+	struct device	*netdev[DSA_MAX_PORTS];
+
 	/* set to size of eeprom if supported by the switch */
 	int		eeprom_len;
 
@@ -170,6 +175,7 @@ struct dsa_mall_tc_entry {
 struct dsa_port {
 	struct dsa_switch	*ds;
 	unsigned int		index;
+	const char		*name;
 	struct net_device	*netdev;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 1c546b6621ee..6f5f0a2ad256 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -78,19 +78,28 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
 	kref_put(&dst->refcount, dsa_free_dst);
 }
 
+/* For platform data configurations, we need to have a valid name argument to
+ * differentiate a disabled port from an enabled one
+ */
 static bool dsa_port_is_valid(struct dsa_port *port)
 {
-	return !!port->dn;
+	return !!(port->dn || port->name);
 }
 
 static bool dsa_port_is_dsa(struct dsa_port *port)
 {
-	return !!of_parse_phandle(port->dn, "link", 0);
+	if (port->name && !strcmp(port->name, "dsa"))
+		return true;
+	else
+		return !!of_parse_phandle(port->dn, "link", 0);
 }
 
 static bool dsa_port_is_cpu(struct dsa_port *port)
 {
-	return !!of_parse_phandle(port->dn, "ethernet", 0);
+	if (port->name && !strcmp(port->name, "cpu"))
+		return true;
+	else
+		return !!of_parse_phandle(port->dn, "ethernet", 0);
 }
 
 static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
@@ -250,10 +259,11 @@ static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
 static int dsa_user_port_apply(struct dsa_port *port, u32 index,
 			       struct dsa_switch *ds)
 {
-	const char *name;
+	const char *name = port->name;
 	int err;
 
-	name = of_get_property(port->dn, "label", NULL);
+	if (port->dn)
+		name = of_get_property(port->dn, "label", NULL);
 	if (!name)
 		name = "eth%d";
 
@@ -444,11 +454,16 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 	struct net_device *ethernet_dev;
 	struct device_node *ethernet;
 
-	ethernet = of_parse_phandle(port->dn, "ethernet", 0);
-	if (!ethernet)
-		return -EINVAL;
+	if (port->dn) {
+		ethernet = of_parse_phandle(port->dn, "ethernet", 0);
+		if (!ethernet)
+			return -EINVAL;
+		ethernet_dev = of_find_net_device_by_node(ethernet);
+	} else {
+		ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
+		dev_put(ethernet_dev);
+	}
 
-	ethernet_dev = of_find_net_device_by_node(ethernet);
 	if (!ethernet_dev)
 		return -EPROBE_DEFER;
 
@@ -551,6 +566,33 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 	return 0;
 }
 
+static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds)
+{
+	bool valid_name_found = false;
+	unsigned int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		if (!cd->port_names[i])
+			continue;
+
+		ds->ports[i].name = cd->port_names[i];
+
+		/* Initialize enabled_port_mask now for drv->setup()
+		 * to have access to a correct value, just like what
+		 * net/dsa/dsa.c::dsa_switch_setup_one does.
+		 */
+		if (!dsa_port_is_cpu(&ds->ports[i]))
+			ds->enabled_port_mask |= 1 << i;
+
+		valid_name_found = true;
+	}
+
+	if (!valid_name_found && i == DSA_MAX_PORTS)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
 {
 	int err;
@@ -575,6 +617,18 @@ static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
 	return 0;
 }
 
+static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index)
+{
+	if (!pd)
+		return -ENODEV;
+
+	/* We do not support complex trees with dsa_chip_data */
+	*tree = 0;
+	*index = 0;
+
+	return 0;
+}
+
 static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 					 struct device_node *np)
 {
@@ -591,23 +645,34 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 
 static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
+	struct dsa_chip_data *pdata = dev->platform_data;
 	struct device_node *np = dev->of_node;
 	struct dsa_switch_tree *dst;
 	struct device_node *ports;
 	u32 tree, index;
 	int i, err;
 
-	err = dsa_parse_member_dn(np, &tree, &index);
-	if (err)
-		return err;
+	if (np) {
+		err = dsa_parse_member_dn(np, &tree, &index);
+		if (err)
+			return err;
 
-	ports = dsa_get_ports(ds, np);
-	if (IS_ERR(ports))
-		return PTR_ERR(ports);
+		ports = dsa_get_ports(ds, np);
+		if (IS_ERR(ports))
+			return PTR_ERR(ports);
 
-	err = dsa_parse_ports_dn(ports, ds);
-	if (err)
-		return err;
+		err = dsa_parse_ports_dn(ports, ds);
+		if (err)
+			return err;
+	} else {
+		err = dsa_parse_member(pdata, &tree, &index);
+		if (err)
+			return err;
+
+		err = dsa_parse_ports(pdata, ds);
+		if (err)
+			return err;
+	}
 
 	dst = dsa_get_dst(tree);
 	if (!dst) {
@@ -623,6 +688,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 
 	ds->dst = dst;
 	ds->index = index;
+	ds->cd = pdata;
 
 	/* Initialize the routing table */
 	for (i = 0; i < DSA_MAX_SWITCHES; ++i)
-- 
cgit v1.2.3


From 89d82452d1fd283bb1e313bc7a1f7c324362cf02 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Mon, 6 Feb 2017 16:07:13 +0000
Subject: net/sched: act_mirred: remove duplicated include from act_mirred.c

Remove duplicated include.

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_mirred.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 84682f02b611..af49c7dca860 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -28,8 +28,6 @@
 #include <linux/tc_act/tc_mirred.h>
 #include <net/tc_act/tc_mirred.h>
 
-#include <linux/if_arp.h>
-
 #define MIRRED_TAB_MASK     7
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
-- 
cgit v1.2.3


From bb4005bae3dab0dd21b239b1da193053151c07ba Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Mon, 6 Feb 2017 16:15:05 +0000
Subject: ipv6: sr: fix non static symbol warnings

Fixes the following sparse warnings:

net/ipv6/seg6_iptunnel.c:58:5: warning:
 symbol 'nla_put_srh' was not declared. Should it be static?
net/ipv6/seg6_iptunnel.c:238:5: warning:
 symbol 'seg6_input' was not declared. Should it be static?
net/ipv6/seg6_iptunnel.c:254:5: warning:
 symbol 'seg6_output' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 6124e159c882..85582257d3af 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -55,8 +55,8 @@ static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
 	[SEG6_IPTUNNEL_SRH]	= { .type = NLA_BINARY },
 };
 
-int nla_put_srh(struct sk_buff *skb, int attrtype,
-		struct seg6_iptunnel_encap *tuninfo)
+static int nla_put_srh(struct sk_buff *skb, int attrtype,
+		       struct seg6_iptunnel_encap *tuninfo)
 {
 	struct seg6_iptunnel_encap *data;
 	struct nlattr *nla;
@@ -235,7 +235,7 @@ static int seg6_do_srh(struct sk_buff *skb)
 	return 0;
 }
 
-int seg6_input(struct sk_buff *skb)
+static int seg6_input(struct sk_buff *skb)
 {
 	int err;
 
@@ -251,7 +251,7 @@ int seg6_input(struct sk_buff *skb)
 	return dst_input(skb);
 }
 
-int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct dst_entry *dst = NULL;
-- 
cgit v1.2.3


From 9b8805a325591cf5b6b9df71200de25a2bd721fd Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:11 +0200
Subject: sock: add sk_dst_pending_confirm flag

Add new sock flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
As not all call paths lock the socket use full word for
the flag.

Add sk_dst_confirm as replacement for dst_confirm when
called for received packets.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 12 ++++++++++++
 net/core/sock.c    |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 94e65fd70354..85d856b94b4b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -240,6 +240,7 @@ struct sock_common {
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early demux
   *	@sk_dst_cache: destination cache
+  *	@sk_dst_pending_confirm: need to confirm neighbour
   *	@sk_policy: flow policy
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
@@ -393,6 +394,8 @@ struct sock {
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
+	__u32			sk_dst_pending_confirm;
+	/* Note: 32bit hole on 64bit arches */
 	long			sk_sndtimeo;
 	struct timer_list	sk_timer;
 	__u32			sk_priority;
@@ -1764,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk)
 		if (ndst != dst) {
 			rcu_assign_pointer(sk->sk_dst_cache, ndst);
 			sk_tx_queue_clear(sk);
+			sk->sk_dst_pending_confirm = 0;
 		}
 	}
 }
@@ -1774,6 +1778,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
+	sk->sk_dst_pending_confirm = 0;
 	/*
 	 * This can be called while sk is owned by the caller only,
 	 * with no state that can be checked in a rcu_dereference_check() cond
@@ -1789,6 +1794,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
+	sk->sk_dst_pending_confirm = 0;
 	old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
 	dst_release(old_dst);
 }
@@ -1809,6 +1815,12 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+static inline void sk_dst_confirm(struct sock *sk)
+{
+	if (!sk->sk_dst_pending_confirm)
+		sk->sk_dst_pending_confirm = 1;
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index 8b35debfe454..b74356535559 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
+		sk->sk_dst_pending_confirm = 0;
 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
@@ -1519,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 				af_family_clock_key_strings[newsk->sk_family]);
 
 		newsk->sk_dst_cache	= NULL;
+		newsk->sk_dst_pending_confirm = 0;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
 		atomic_set(&newsk->sk_drops, 0);
-- 
cgit v1.2.3


From 4ff0620354f2b39b9fe2a91c22c4de9d1fba0c8e Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:12 +0200
Subject: net: add dst_pending_confirm flag to skbuff

Add new skbuff flag to allow protocols to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

Add sock_confirm_neigh() helper to confirm the neighbour and
use it for IPv4, IPv6 and VRF before dst_neigh_output.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c      |  5 ++++-
 include/linux/skbuff.h | 12 ++++++++++++
 include/net/sock.h     | 14 ++++++++++++++
 net/ipv4/ip_output.c   |  5 ++++-
 net/ipv6/ip6_output.c  |  1 +
 5 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 264fc1585b3c..630eafdb79e8 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
@@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
-	if (!IS_ERR(neigh))
+	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
+	}
 
 	rcu_read_unlock_bh();
 err:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c6a78e1892b6..f1adddc1c5ac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -612,6 +612,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@wifi_acked_valid: wifi_acked was set
  *	@wifi_acked: whether frame was acked on wifi or not
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
+ *	@dst_pending_confirm: need to confirm neighbour
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
@@ -741,6 +742,7 @@ struct sk_buff {
 	__u8			csum_level:2;
 	__u8			csum_bad:1;
 
+	__u8			dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
@@ -3698,6 +3700,16 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 	return skb->queue_mapping != 0;
 }
 
+static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
+{
+	skb->dst_pending_confirm = val;
+}
+
+static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
+{
+	return skb->dst_pending_confirm != 0;
+}
+
 static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 {
 #ifdef CONFIG_XFRM
diff --git a/include/net/sock.h b/include/net/sock.h
index 85d856b94b4b..6f83e78eaa5a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1821,6 +1821,20 @@ static inline void sk_dst_confirm(struct sock *sk)
 		sk->sk_dst_pending_confirm = 1;
 }
 
+static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
+{
+	if (skb_get_dst_pending_confirm(skb)) {
+		struct sock *sk = skb->sk;
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+		if (sk && sk->sk_dst_pending_confirm)
+			sk->sk_dst_pending_confirm = 0;
+	}
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b67719f45953..c9fc32fa3272 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
-		int res = dst_neigh_output(dst, neigh, skb);
+		int res;
+
+		sock_confirm_neigh(skb, neigh);
+		res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b6a94ff0bbd0..14d99fbf102e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
-- 
cgit v1.2.3


From c86a773c78025f5b825bacd7b846f4fa60dc0317 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:13 +0200
Subject: sctp: add dst_pending_confirm flag

Add new transport flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
The flag is propagated from transport to every packet.
It is reset when cached dst is reset.

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    |  6 ++----
 include/net/sctp/structs.h |  4 ++++
 net/sctp/associola.c       |  3 +--
 net/sctp/output.c          | 10 +++++++++-
 net/sctp/outqueue.c        |  2 +-
 net/sctp/sm_make_chunk.c   |  6 ++----
 net/sctp/sm_sideeffect.c   |  2 +-
 net/sctp/socket.c          |  4 ++--
 net/sctp/transport.c       | 16 +++++++++++++++-
 9 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 3cfd365bcfbc..480b65a24aff 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -593,10 +593,8 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
  */
 static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
 {
-	if (t->dst && !dst_check(t->dst, t->dst_cookie)) {
-		dst_release(t->dst);
-		t->dst = NULL;
-	}
+	if (t->dst && !dst_check(t->dst, t->dst_cookie))
+		sctp_transport_dst_release(t);
 
 	return t->dst;
 }
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 231fa9ac50bd..6a685049f67f 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -804,6 +804,8 @@ struct sctp_transport {
 
 	__u32 burst_limited;	/* Holds old cwnd when max.burst is applied */
 
+	__u32 dst_pending_confirm;	/* need to confirm neighbour */
+
 	/* Destination */
 	struct dst_entry *dst;
 	/* Source address. */
@@ -950,6 +952,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *);
 void sctp_transport_reset(struct sctp_transport *);
 void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
+void sctp_transport_dst_release(struct sctp_transport *t);
+void sctp_transport_dst_confirm(struct sctp_transport *t);
 
 
 /* This is the structure we use to queue packets as they come into
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e50dc6d7543f..2a6835b4562b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -832,8 +832,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		if (transport->state != SCTP_UNCONFIRMED)
 			transport->state = SCTP_INACTIVE;
 		else {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 			ulp_notify = false;
 		}
 
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 07ab5062e541..814eac047467 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -546,6 +546,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	struct sctp_association *asoc = tp->asoc;
 	struct sctp_chunk *chunk, *tmp;
 	int pkt_count, gso = 0;
+	int confirm;
 	struct dst_entry *dst;
 	struct sk_buff *head;
 	struct sctphdr *sh;
@@ -624,7 +625,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 			asoc->peer.last_sent_to = tp;
 	}
 	head->ignore_df = packet->ipfragok;
-	tp->af_specific->sctp_xmit(head, tp);
+	confirm = tp->dst_pending_confirm;
+	if (confirm)
+		skb_set_dst_pending_confirm(head, 1);
+	/* neighbour should be confirmed on successful transmission or
+	 * positive error
+	 */
+	if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm)
+		tp->dst_pending_confirm = 0;
 
 out:
 	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 65abe22d8691..db352e5d61f8 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1654,7 +1654,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 
 		if (forward_progress) {
 			if (transport->dst)
-				dst_confirm(transport->dst);
+				sctp_transport_dst_confirm(transport);
 		}
 	}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index ad3445b3408e..c7d3249f88ec 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3333,8 +3333,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	case SCTP_PARAM_DEL_IP:
@@ -3348,8 +3347,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	default:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index a4552712b882..51abcc90fe75 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -755,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 	 * forward progress.
 	 */
 	if (t->dst)
-		dst_confirm(t->dst);
+		sctp_transport_dst_confirm(t);
 
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5fc7122c76de..a4609a0be76d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -592,7 +592,7 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 			list_for_each_entry(trans,
 			    &asoc->peer.transport_addr_list, transports) {
 				/* Clear the source and route cache */
-				dst_release(trans->dst);
+				sctp_transport_dst_release(trans);
 				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
 				    2*asoc->pathmtu, 4380));
 				trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -843,7 +843,7 @@ skip_mkasconf:
 		 */
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 					transports) {
-			dst_release(transport->dst);
+			sctp_transport_dst_release(transport);
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index baa1ac00d7b5..5b63ceb3bf37 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -240,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 {
 	/* If we don't have a fresh route, look one up */
 	if (!transport->dst || transport->dst->obsolete) {
-		dst_release(transport->dst);
+		sctp_transport_dst_release(transport);
 		transport->af_specific->get_dst(transport, &transport->saddr,
 						&transport->fl, sk);
 	}
@@ -672,3 +672,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
 			sctp_transport_hold(t);
 	}
 }
+
+/* Drop dst */
+void sctp_transport_dst_release(struct sctp_transport *t)
+{
+	dst_release(t->dst);
+	t->dst = NULL;
+	t->dst_pending_confirm = 0;
+}
+
+/* Schedule neighbour confirm */
+void sctp_transport_dst_confirm(struct sctp_transport *t)
+{
+	t->dst_pending_confirm = 1;
+}
-- 
cgit v1.2.3


From c3a2e8370534f810cac6050169db0ed3e0f94f0b Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:14 +0200
Subject: tcp: replace dst_confirm with sk_dst_confirm

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
Use the new sk_dst_confirm() helper to propagate the
indication from received packets to sock_confirm_neigh().

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Tested-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c   | 12 +++---------
 net/ipv4/tcp_metrics.c |  7 ++-----
 net/ipv4/tcp_output.c  |  2 ++
 3 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 27c95acbb52f..2c0ff327b6df 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3644,11 +3644,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-		struct dst_entry *dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
-	}
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		sk_dst_confirm(sk);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
@@ -5995,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case TCP_FIN_WAIT1: {
-		struct dst_entry *dst;
 		int tmo;
 
 		/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6022,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tcp_set_state(sk, TCP_FIN_WAIT2);
 		sk->sk_shutdown |= SEND_SHUTDOWN;
 
-		dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
+		sk_dst_confirm(sk);
 
 		if (!sock_flag(sk, SOCK_DEAD)) {
 			/* Wake up lingering close() */
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b9ed0d50aead..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
 	u32 val;
 	int m;
 
+	sk_dst_confirm(sk);
 	if (sysctl_tcp_nometrics_save || !dst)
 		return;
 
-	if (dst->flags & DST_HOST)
-		dst_confirm(dst);
-
 	rcu_read_lock();
 	if (icsk->icsk_backoff || !tp->srtt_us) {
 		/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
+	sk_dst_confirm(sk);
 	if (!dst)
 		goto reset;
 
-	dst_confirm(dst);
-
 	rcu_read_lock();
 	tm = tcp_get_metrics(sk, dst, true);
 	if (!tm) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ccf1ef4dcba4..61f272a99a49 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -980,6 +980,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
+	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
-- 
cgit v1.2.3


From 63fca65d08632fbec9d9b655f671cf08aa1aeeb8 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:15 +0200
Subject: net: add confirm_neigh method to dst_ops

Add confirm_neigh method to dst_ops and use it from IPv4 and IPv6
to lookup and confirm the neighbour. Its usage via the new helper
dst_confirm_neigh() should be restricted to MSG_PROBE users for
performance reasons.

For XFRM prefer the last tunnel address, if present. With help
from Steffen Klassert.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h      | 16 ++++++++++++++++
 include/net/dst.h      |  7 +++++++
 include/net/dst_ops.h  |  2 ++
 include/net/ndisc.h    | 17 +++++++++++++++++
 net/ipv4/route.c       | 19 +++++++++++++++++++
 net/ipv6/route.c       | 16 ++++++++++++++++
 net/xfrm/xfrm_policy.c | 19 +++++++++++++++++++
 7 files changed, 96 insertions(+)

(limited to 'net')

diff --git a/include/net/arp.h b/include/net/arp.h
index 5e0f891d476c..65619a2de6f4 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -35,6 +35,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32
 	return n;
 }
 
+static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv4_neigh_lookup_noref(dev, key);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
 void arp_init(void);
 int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 void arp_send(int type, int ptype, __be32 dest_ip,
diff --git a/include/net/dst.h b/include/net/dst.h
index 6835d224d47b..3a3b34b83b00 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -477,6 +477,13 @@ static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst
 	return IS_ERR(n) ? NULL : n;
 }
 
+static inline void dst_confirm_neigh(const struct dst_entry *dst,
+				     const void *daddr)
+{
+	if (dst->ops->confirm_neigh)
+		dst->ops->confirm_neigh(dst, daddr);
+}
+
 static inline void dst_link_failure(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 8a2b66d8d78d..c84b3287e38b 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -33,6 +33,8 @@ struct dst_ops {
 	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
 						struct sk_buff *skb,
 						const void *daddr);
+	void			(*confirm_neigh)(const struct dst_entry *dst,
+						 const void *daddr);
 
 	struct kmem_cache	*kmem_cachep;
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index d562a2fe4860..8a0214654b6b 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -391,6 +391,23 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
 	return n;
 }
 
+static inline void __ipv6_confirm_neigh(struct net_device *dev,
+					const void *pkey)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv6_neigh_lookup_noref(dev, pkey);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4b7c231c1aef..cb494a5050f7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
 	.redirect =		ip_do_redirect,
 	.local_out =		__ip_local_out,
 	.neigh_lookup =		ipv4_neigh_lookup,
+	.confirm_neigh =	ipv4_confirm_neigh,
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	const __be32 *pkey = daddr;
+	const struct rtable *rt;
+
+	rt = (const struct rtable *)dst;
+	if (rt->rt_gateway)
+		pkey = (const __be32 *)&rt->rt_gateway;
+	else if (!daddr ||
+		 (rt->rt_flags &
+		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+		return;
+
+	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
 #define IP_IDENTS_SZ 2048u
 
 static atomic_t *ip_idents __read_mostly;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8ffa24cc8899..98b183f1bc8b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -223,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&nd_tbl, daddr, dst->dev);
 }
 
+static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	struct rt6_info *rt = (struct rt6_info *)dst;
+
+	daddr = choose_neigh_daddr(rt, NULL, daddr);
+	if (!daddr)
+		return;
+	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+		return;
+	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
+		return;
+	__ipv6_confirm_neigh(dev, daddr);
+}
+
 static struct dst_ops ip6_dst_ops_template = {
 	.family			=	AF_INET6,
 	.gc			=	ip6_dst_gc,
@@ -239,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = {
 	.redirect		=	rt6_do_redirect,
 	.local_out		=	__ip6_local_out,
 	.neigh_lookup		=	ip6_neigh_lookup,
+	.confirm_neigh		=	ip6_confirm_neigh,
 };
 
 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 99ad1af2927f..0a0f63d3cc96 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2856,6 +2856,23 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 	return dst->path->ops->neigh_lookup(dst, skb, daddr);
 }
 
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	const struct dst_entry *path = dst->path;
+
+	for (; dst != path; dst = dst->child) {
+		const struct xfrm_state *xfrm = dst->xfrm;
+
+		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
+			continue;
+		if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
+			daddr = xfrm->coaddr;
+		else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
+			daddr = &xfrm->id.daddr;
+	}
+	path->ops->confirm_neigh(path, daddr);
+}
+
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
 	int err = 0;
@@ -2882,6 +2899,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 			dst_ops->link_failure = xfrm_link_failure;
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
+		if (likely(!dst_ops->confirm_neigh))
+			dst_ops->confirm_neigh = xfrm_confirm_neigh;
 		if (likely(afinfo->garbage_collect == NULL))
 			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
-- 
cgit v1.2.3


From 0dec879f636f11b0ffda1cb5fd96a1754c59ead3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:16 +0200
Subject: net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

The datagram protocols can use MSG_CONFIRM to confirm the
neighbour. When used with MSG_PROBE we do not reach the
code where neighbour is confirmed, so we have to do the
same slow lookup by using the dst_confirm_neigh() helper.
When MSG_PROBE is not used, ip_append_data/ip6_append_data
will set the skb flag dst_pending_confirm.

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  |  6 ++++++
 net/ipv4/ping.c       |  3 ++-
 net/ipv4/raw.c        |  6 +++++-
 net/ipv4/udp.c        |  3 ++-
 net/ipv6/ip6_output.c |  6 ++++++
 net/ipv6/raw.c        |  6 +++++-
 net/ipv6/route.c      | 27 ++++++++++++++-------------
 net/ipv6/udp.c        |  3 ++-
 net/l2tp/l2tp_ip6.c   |  3 ++-
 9 files changed, 44 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c9fc32fa3272..7a719f1ae556 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -889,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
 
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb_set_dst_pending_confirm(skb, 1);
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1089,6 +1092,9 @@ alloc_new_skb:
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb_set_dst_pending_confirm(skb, 1);
+
 			/*
 			 * Put the packet on the pending queue.
 			 */
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 592db6a3a0e9..6ee792d83d5b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -848,7 +848,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4e49e5cb001c..8119e1f66e03 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
 
+	if (flags & MSG_CONFIRM)
+		skb_set_dst_pending_confirm(skb, 1);
+
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
 	if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ out:
 	return len;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf6ba3387401..4a1ba04565d1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1101,7 +1101,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4->daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 14d99fbf102e..d299040613a0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1145,6 +1145,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		skb->protocol = htons(ETH_P_IPV6);
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb_set_dst_pending_confirm(skb, 1);
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1517,6 +1520,9 @@ alloc_new_skb:
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb_set_dst_pending_confirm(skb, 1);
+
 			/*
 			 * Put the packet on the pending queue
 			 */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ea89073c8247..f174e76e6505 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 
 	skb->ip_summed = CHECKSUM_NONE;
 
+	if (flags & MSG_CONFIRM)
+		skb_set_dst_pending_confirm(skb, 1);
+
 	skb->transport_header = skb->network_header;
 	err = memcpy_from_msg(iph, msg, length);
 	if (err)
@@ -934,7 +937,8 @@ out:
 	txopt_put(opt_to_free);
 	return err < 0 ? err : len;
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98b183f1bc8b..f54f4265b37f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1381,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 				 const struct ipv6hdr *iph, u32 mtu)
 {
+	const struct in6_addr *daddr, *saddr;
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 
 	if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1389,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 
-	dst_confirm(dst);
+	if (iph) {
+		daddr = &iph->daddr;
+		saddr = &iph->saddr;
+	} else if (sk) {
+		daddr = &sk->sk_v6_daddr;
+		saddr = &inet6_sk(sk)->saddr;
+	} else {
+		daddr = NULL;
+		saddr = NULL;
+	}
+	dst_confirm_neigh(dst, daddr);
 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
 	if (mtu >= dst_mtu(dst))
 		return;
 
 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
 		rt6_do_update_pmtu(rt6, mtu);
-	} else {
-		const struct in6_addr *daddr, *saddr;
+	} else if (daddr) {
 		struct rt6_info *nrt6;
 
-		if (iph) {
-			daddr = &iph->daddr;
-			saddr = &iph->saddr;
-		} else if (sk) {
-			daddr = &sk->sk_v6_daddr;
-			saddr = &inet6_sk(sk)->saddr;
-		} else {
-			return;
-		}
 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
@@ -2332,7 +2333,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 * Look, redirects are sent only in response to data packets,
 	 * so that this nexthop apparently is reachable. --ANK
 	 */
-	dst_confirm(&rt->dst);
+	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
 
 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
 	if (!neigh)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b4c6516a3a0c..51346fa70298 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1308,7 +1308,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 4b06eb415f68..734798a00ca0 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -658,7 +658,8 @@ out:
 	return err < 0 ? err : len;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
-- 
cgit v1.2.3


From 51ce8bd4d17a761e1a90a34a1b5c9b762cce7553 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Mon, 6 Feb 2017 23:14:17 +0200
Subject: net: pending_confirm is not used anymore

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
As last step, we can remove the pending_confirm flag.

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 14 ++------------
 net/core/dst.c    |  1 -
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 3a3b34b83b00..84a1043dd6a1 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -59,8 +59,6 @@ struct dst_entry {
 #define DST_XFRM_QUEUE		0x0100
 #define DST_METADATA		0x0200
 
-	unsigned short		pending_confirm;
-
 	short			error;
 
 	/* A non-zero value of dst->obsolete forces by-hand validation
@@ -78,6 +76,8 @@ struct dst_entry {
 #define DST_OBSOLETE_KILL	-2
 	unsigned short		header_len;	/* more space at head required */
 	unsigned short		trailer_len;	/* space to reserve at tail */
+	unsigned short		__pad3;
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	__u32			tclassid;
 #else
@@ -440,7 +440,6 @@ static inline void dst_rcu_free(struct rcu_head *head)
 
 static inline void dst_confirm(struct dst_entry *dst)
 {
-	dst->pending_confirm = 1;
 }
 
 static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
@@ -448,15 +447,6 @@ static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
 {
 	const struct hh_cache *hh;
 
-	if (dst->pending_confirm) {
-		unsigned long now = jiffies;
-
-		dst->pending_confirm = 0;
-		/* avoid dirtying neighbour */
-		if (n->confirmed != now)
-			n->confirmed = now;
-	}
-
 	hh = &n->hh;
 	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
 		return neigh_hh_output(hh, skb);
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
-	dst->pending_confirm = 0;
 	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
-- 
cgit v1.2.3


From e69e46261063a25c3907bed16a2e9d18b115d1fd Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Mon, 6 Feb 2017 15:55:23 -0800
Subject: net: dsa: Do not clobber PHY link outside of state machine

Calling phy_read_status() means that we may call into
genphy_read_status() which in turn will use genphy_update_link() which
can make changes to phydev->link outside of the state machine's state
transitions. This is an invalid behavior that is now caught as of
811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")

Reported-by: Zefir Kurtisi <zefir.kurtisi@neratec.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 061a49c29cef..c34872e1febc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -684,14 +684,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	int err;
+	int err = -EOPNOTSUPP;
 
-	err = -EOPNOTSUPP;
-	if (p->phy != NULL) {
-		err = phy_read_status(p->phy);
-		if (err == 0)
-			err = phy_ethtool_ksettings_get(p->phy, cmd);
-	}
+	if (p->phy != NULL)
+		err = phy_ethtool_ksettings_get(p->phy, cmd);
 
 	return err;
 }
-- 
cgit v1.2.3


From 1f02b5f42f53af516c4f5f747390e66d7a8f0bfe Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 7 Feb 2017 10:56:38 +0000
Subject: net: bridge: remove redundant check to see if err is set

The error check on err is redundant as it is being checked
previously each time it has been updated.  Remove this redundant
check.

Detected with CoverityScan, CID#140030("Logically dead code")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 1cbdc5b96aa7..a8f6acd23e30 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -612,9 +612,6 @@ static int br_afspec(struct net_bridge *br,
 				return err;
 			break;
 		}
-
-		if (err)
-			return err;
 	}
 
 	return err;
-- 
cgit v1.2.3


From bb580ad698aeb4e5455d701c228c50355f84c056 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Tue, 7 Feb 2017 12:46:46 +0100
Subject: bridge: tunnel: fix attribute checks in br_parse_vlan_tunnel_info

These checks should go after the attributes have been parsed otherwise
we're using tb uninitialized.

Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support")
Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink_tunnel.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index 99c68012c9d4..62eaf750bd9e 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -229,15 +229,15 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr,
 
 	memset(tinfo, 0, sizeof(*tinfo));
 
-	if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
-	    !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
-		return -EINVAL;
-
 	err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
 			       attr, vlan_tunnel_policy);
 	if (err < 0)
 		return err;
 
+	if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
+	    !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
+		return -EINVAL;
+
 	tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
 	vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
 	if (vid >= VLAN_VID_MASK)
-- 
cgit v1.2.3


From a8cab863a75f2d353588d4c3c37dc37c77b96c83 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Tue, 7 Feb 2017 06:43:23 -0800
Subject: bridge: remove unnecessary check for vtbegin in
 br_fill_vlan_tinfo_range

vtbegin should not be NULL in this function, Its already checked by the
caller.

this should silence the below smatch complaint:
  net/bridge/br_netlink_tunnel.c:144 br_fill_vlan_tinfo_range()
    error: we previously assumed 'vtbegin' could be null (see line 130)

net/bridge/br_netlink_tunnel.c
   129
   130      if (vtbegin && vtend && (vtend->vid - vtbegin->vid) > 0) {
                    ^^^^^^^
Check for NULL.

Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support")
Reported-By: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index 62eaf750bd9e..f38473509647 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -127,7 +127,7 @@ static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
 {
 	int err;
 
-	if (vtbegin && vtend && (vtend->vid - vtbegin->vid) > 0) {
+	if (vtend && (vtend->vid - vtbegin->vid) > 0) {
 		/* add range to skb */
 		err = br_fill_vlan_tinfo(skb, vtbegin->vid,
 					 vtbegin->tinfo.tunnel_id,
-- 
cgit v1.2.3


From ca6d4480f87db9d9470d3d7bbe445953fa105e57 Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Tue, 7 Feb 2017 08:46:46 -0800
Subject: bridge: avoid unnecessary read of jiffies

Jiffies is volatile so read it once.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c   | 6 ++++--
 net/bridge/br_input.c | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5028691fa68a..5693168e88b6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -592,13 +592,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
 					source->dev->name, addr, vid);
 		} else {
+			unsigned long now = jiffies;
+
 			/* fastpath: update of existing entry */
 			if (unlikely(source != fdb->dst)) {
 				fdb->dst = source;
 				fdb_modified = true;
 			}
-			if (jiffies != fdb->updated)
-				fdb->updated = jiffies;
+			if (now != fdb->updated)
+				fdb->updated = now;
 			if (unlikely(added_by_user))
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 220943f920d2..4615a9b3e26c 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -195,11 +195,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	}
 
 	if (dst) {
+		unsigned long now = jiffies;
+
 		if (dst->is_local)
 			return br_pass_frame_up(skb);
 
-		if (jiffies != dst->used)
-			dst->used = jiffies;
+		if (now != dst->used)
+			dst->used = now;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
 		if (!mcast_hit)
-- 
cgit v1.2.3


From b699b71d82e77203be43bda5ff1b72516f129581 Mon Sep 17 00:00:00 2001
From: Pichugin Dmitry
Date: Sat, 28 Jan 2017 17:06:53 +0300
Subject: cfg80211 debugfs: Cleanup some checkpatch issues

This fixes the checkpatch.pl warnings:
* Macros should not use a trailing semicolon.
* Spaces required around that '='.
* Symbolic permissions 'S_IRUGO' are not preferred.

Signed-off-by: Dmitriy Pichugin <smokeman85@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/debugfs.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 5d453916a417..30fc6eb352bc 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -17,7 +17,7 @@
 static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 			    size_t count, loff_t *ppos)			\
 {									\
-	struct wiphy *wiphy= file->private_data;		\
+	struct wiphy *wiphy = file->private_data;			\
 	char buf[buflen];						\
 	int res;							\
 									\
@@ -29,14 +29,14 @@ static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
 	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
-};
+}
 
 DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
-		      wiphy->rts_threshold)
+		      wiphy->rts_threshold);
 DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
 		      wiphy->frag_threshold);
 DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
-		      wiphy->retry_short)
+		      wiphy->retry_short);
 DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
 		      wiphy->retry_long);
 
@@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = {
 };
 
 #define DEBUGFS_ADD(name)						\
-	debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops);
+	debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops)
 
 void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
 {
-- 
cgit v1.2.3


From fe8de3da13bdbcbe8b583a3bbadf677da0f04f83 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 6 Feb 2017 10:49:27 +0000
Subject: mac80211: fils_aead: Use crypto api CMAC shash rather than bare
 cipher

Switch the FILS AEAD code to use a cmac(aes) shash instantiated by the
crypto API rather than reusing the open coded implementation in
aes_cmac_vector(). This makes the code more understandable, and allows
platforms to implement cmac(aes) in a more secure (*) and efficient way
than is typically possible when using the AES cipher directly.

So replace the crypto_cipher by a crypto_shash, and update the aes_s2v()
routine to call the shash interface directly.

* In particular, the generic table based AES implementation is sensitive
  to known-plaintext timing attacks on the key, to which AES based MAC
  algorithms are especially vulnerable, given that their plaintext is not
  usually secret. Time invariant alternatives are available (e.g., based
  on SIMD algorithms), but may incur a setup cost that is prohibitive when
  operating on a single block at a time, which is why they don't usually
  expose the cipher API.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/Kconfig     |  1 +
 net/mac80211/aes_cmac.h  |  4 ---
 net/mac80211/fils_aead.c | 74 +++++++++++++++++++++---------------------------
 3 files changed, 34 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 3891cbd2adea..76e30f4797fb 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -6,6 +6,7 @@ config MAC80211
 	select CRYPTO_AES
 	select CRYPTO_CCM
 	select CRYPTO_GCM
+	select CRYPTO_CMAC
 	select CRC32
 	---help---
 	  This option enables the hardware independent IEEE 802.11
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index c827e1d5de8b..3702041f44fd 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -11,10 +11,6 @@
 
 #include <linux/crypto.h>
 
-void gf_mulx(u8 *pad);
-void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
-		     const u8 *addr[], const size_t *len, u8 *mac,
-		     size_t mac_len);
 struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
 						   size_t key_len);
 void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
index ecfdd97758a3..061a9211ecf5 100644
--- a/net/mac80211/fils_aead.c
+++ b/net/mac80211/fils_aead.c
@@ -9,66 +9,58 @@
 
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
+#include <crypto/hash.h>
 #include <crypto/skcipher.h>
 
 #include "ieee80211_i.h"
 #include "aes_cmac.h"
 #include "fils_aead.h"
 
-static int aes_s2v(struct crypto_cipher *tfm,
+static void gf_mulx(u8 *pad)
+{
+	u64 a = get_unaligned_be64(pad);
+	u64 b = get_unaligned_be64(pad + 8);
+
+	put_unaligned_be64((a << 1) | (b >> 63), pad);
+	put_unaligned_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0), pad + 8);
+}
+
+static int aes_s2v(struct crypto_shash *tfm,
 		   size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
 {
-	u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE];
+	u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {};
+	SHASH_DESC_ON_STACK(desc, tfm);
 	size_t i;
-	const u8 *data[2];
-	size_t data_len[2], data_elems;
+
+	desc->tfm = tfm;
 
 	/* D = AES-CMAC(K, <zero>) */
-	memset(tmp, 0, AES_BLOCK_SIZE);
-	data[0] = tmp;
-	data_len[0] = AES_BLOCK_SIZE;
-	aes_cmac_vector(tfm, 1, data, data_len, d, AES_BLOCK_SIZE);
+	crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d);
 
 	for (i = 0; i < num_elem - 1; i++) {
 		/* D = dbl(D) xor AES_CMAC(K, Si) */
 		gf_mulx(d); /* dbl */
-		aes_cmac_vector(tfm, 1, &addr[i], &len[i], tmp,
-				AES_BLOCK_SIZE);
+		crypto_shash_digest(desc, addr[i], len[i], tmp);
 		crypto_xor(d, tmp, AES_BLOCK_SIZE);
 	}
 
+	crypto_shash_init(desc);
+
 	if (len[i] >= AES_BLOCK_SIZE) {
 		/* len(Sn) >= 128 */
-		size_t j;
-		const u8 *pos;
-
 		/* T = Sn xorend D */
-
-		/* Use a temporary buffer to perform xorend on Sn (addr[i]) to
-		 * avoid modifying the const input argument.
-		 */
-		data[0] = addr[i];
-		data_len[0] = len[i] - AES_BLOCK_SIZE;
-		pos = addr[i] + data_len[0];
-		for (j = 0; j < AES_BLOCK_SIZE; j++)
-			tmp[j] = pos[j] ^ d[j];
-		data[1] = tmp;
-		data_len[1] = AES_BLOCK_SIZE;
-		data_elems = 2;
+		crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE);
+		crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE,
+			   AES_BLOCK_SIZE);
 	} else {
 		/* len(Sn) < 128 */
 		/* T = dbl(D) xor pad(Sn) */
 		gf_mulx(d); /* dbl */
-		memset(tmp, 0, AES_BLOCK_SIZE);
-		memcpy(tmp, addr[i], len[i]);
-		tmp[len[i]] = 0x80;
-		crypto_xor(d, tmp, AES_BLOCK_SIZE);
-		data[0] = d;
-		data_len[0] = sizeof(d);
-		data_elems = 1;
+		crypto_xor(d, addr[i], len[i]);
+		d[len[i]] ^= 0x80;
 	}
 	/* V = AES-CMAC(K, T) */
-	aes_cmac_vector(tfm, data_elems, data, data_len, v, AES_BLOCK_SIZE);
+	crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v);
 
 	return 0;
 }
@@ -80,7 +72,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
 			   size_t len[], u8 *out)
 {
 	u8 v[AES_BLOCK_SIZE];
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 	struct crypto_skcipher *tfm2;
 	struct skcipher_request *req;
 	int res;
@@ -95,14 +87,14 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
 
 	/* S2V */
 
-	tfm = crypto_alloc_cipher("aes", 0, 0);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 	/* K1 for S2V */
-	res = crypto_cipher_setkey(tfm, key, key_len);
+	res = crypto_shash_setkey(tfm, key, key_len);
 	if (!res)
 		res = aes_s2v(tfm, num_elem, addr, len, v);
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 	if (res)
 		return res;
 
@@ -157,7 +149,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
 			   size_t num_elem, const u8 *addr[], size_t len[],
 			   u8 *out)
 {
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 	struct crypto_skcipher *tfm2;
 	struct skcipher_request *req;
 	struct scatterlist src[1], dst[1];
@@ -210,14 +202,14 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
 
 	/* S2V */
 
-	tfm = crypto_alloc_cipher("aes", 0, 0);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 	/* K1 for S2V */
-	res = crypto_cipher_setkey(tfm, key, key_len);
+	res = crypto_shash_setkey(tfm, key, key_len);
 	if (!res)
 		res = aes_s2v(tfm, num_elem, addr, len, check);
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 	if (res)
 		return res;
 	if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
-- 
cgit v1.2.3


From 26717828b75dd5c46e97f7f4a9b937d038bb2852 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 6 Feb 2017 10:49:28 +0000
Subject: mac80211: aes-cmac: switch to shash CMAC driver

Instead of open coding the CMAC algorithm in the mac80211 driver using
byte wide xors and calls into the crypto layer for each block of data,
instantiate a cmac(aes) synchronous hash and pass all the data into it
directly. This does not only simplify the code, it also allows the use
of more efficient and more secure implementations, especially on
platforms where SIMD ciphers have a considerable setup cost.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/aes_cmac.c | 126 ++++++++++--------------------------------------
 net/mac80211/aes_cmac.h |  11 +++--
 net/mac80211/key.h      |   2 +-
 3 files changed, 32 insertions(+), 107 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index d0bd5fff5f0a..2fb65588490c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -22,126 +22,50 @@
 #define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
 #define AAD_LEN 20
 
+static const u8 zero[CMAC_TLEN_256];
 
-void gf_mulx(u8 *pad)
-{
-	int i, carry;
-
-	carry = pad[0] & 0x80;
-	for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
-		pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
-	pad[AES_BLOCK_SIZE - 1] <<= 1;
-	if (carry)
-		pad[AES_BLOCK_SIZE - 1] ^= 0x87;
-}
-
-void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
-		     const u8 *addr[], const size_t *len, u8 *mac,
-		     size_t mac_len)
-{
-	u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
-	const u8 *pos, *end;
-	size_t i, e, left, total_len;
-
-	memset(cbc, 0, AES_BLOCK_SIZE);
-
-	total_len = 0;
-	for (e = 0; e < num_elem; e++)
-		total_len += len[e];
-	left = total_len;
-
-	e = 0;
-	pos = addr[0];
-	end = pos + len[0];
-
-	while (left >= AES_BLOCK_SIZE) {
-		for (i = 0; i < AES_BLOCK_SIZE; i++) {
-			cbc[i] ^= *pos++;
-			if (pos >= end) {
-				e++;
-				pos = addr[e];
-				end = pos + len[e];
-			}
-		}
-		if (left > AES_BLOCK_SIZE)
-			crypto_cipher_encrypt_one(tfm, cbc, cbc);
-		left -= AES_BLOCK_SIZE;
-	}
-
-	memset(pad, 0, AES_BLOCK_SIZE);
-	crypto_cipher_encrypt_one(tfm, pad, pad);
-	gf_mulx(pad);
-
-	if (left || total_len == 0) {
-		for (i = 0; i < left; i++) {
-			cbc[i] ^= *pos++;
-			if (pos >= end) {
-				e++;
-				pos = addr[e];
-				end = pos + len[e];
-			}
-		}
-		cbc[left] ^= 0x80;
-		gf_mulx(pad);
-	}
-
-	for (i = 0; i < AES_BLOCK_SIZE; i++)
-		pad[i] ^= cbc[i];
-	crypto_cipher_encrypt_one(tfm, pad, pad);
-	memcpy(mac, pad, mac_len);
-}
-
-
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
 			const u8 *data, size_t data_len, u8 *mic)
 {
-	const u8 *addr[3];
-	size_t len[3];
-	u8 zero[CMAC_TLEN];
+	SHASH_DESC_ON_STACK(desc, tfm);
+	u8 out[AES_BLOCK_SIZE];
 
-	memset(zero, 0, CMAC_TLEN);
-	addr[0] = aad;
-	len[0] = AAD_LEN;
-	addr[1] = data;
-	len[1] = data_len - CMAC_TLEN;
-	addr[2] = zero;
-	len[2] = CMAC_TLEN;
+	desc->tfm = tfm;
 
-	aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN);
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, aad, AAD_LEN);
+	crypto_shash_update(desc, data, data_len - CMAC_TLEN);
+	crypto_shash_finup(desc, zero, CMAC_TLEN, out);
+
+	memcpy(mic, out, CMAC_TLEN);
 }
 
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
 			    const u8 *data, size_t data_len, u8 *mic)
 {
-	const u8 *addr[3];
-	size_t len[3];
-	u8 zero[CMAC_TLEN_256];
+	SHASH_DESC_ON_STACK(desc, tfm);
 
-	memset(zero, 0, CMAC_TLEN_256);
-	addr[0] = aad;
-	len[0] = AAD_LEN;
-	addr[1] = data;
-	len[1] = data_len - CMAC_TLEN_256;
-	addr[2] = zero;
-	len[2] = CMAC_TLEN_256;
+	desc->tfm = tfm;
 
-	aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256);
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, aad, AAD_LEN);
+	crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
+	crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
 }
 
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
-						   size_t key_len)
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+						  size_t key_len)
 {
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 
-	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (!IS_ERR(tfm))
-		crypto_cipher_setkey(tfm, key, key_len);
+		crypto_shash_setkey(tfm, key, key_len);
 
 	return tfm;
 }
 
-
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm)
 {
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 }
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index 3702041f44fd..fef531f42003 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -10,13 +10,14 @@
 #define AES_CMAC_H
 
 #include <linux/crypto.h>
+#include <crypto/hash.h>
 
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
-						   size_t key_len);
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+						  size_t key_len);
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
 			const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
 			    const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm);
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
 
 #endif /* AES_CMAC_H */
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 4aa20cef0859..ebdb80b85dc3 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -93,7 +93,7 @@ struct ieee80211_key {
 		} ccmp;
 		struct {
 			u8 rx_pn[IEEE80211_CMAC_PN_LEN];
-			struct crypto_cipher *tfm;
+			struct crypto_shash *tfm;
 			u32 replays; /* dot11RSNAStatsCMACReplays */
 			u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
 		} aes_cmac;
-- 
cgit v1.2.3


From b2347a322d1f6f93f1a39fe17ed08628fc959351 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Tue, 7 Feb 2017 16:20:53 +0300
Subject: mac80211: check for allocation failure in debugfs code

kmalloc() can fail.  Also let's move the allocation out of the
declaration block so it's easier to read.

Fixes: 4a5eccaa9350 ("mac80211: Show pending txqlen in debugfs.")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 47bea0babe78..5fae001f286c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -249,12 +249,19 @@ static ssize_t misc_read(struct file *file, char __user *user_buf,
 	struct ieee80211_local *local = file->private_data;
 	/* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
 	size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
-	char *buf = kzalloc(bufsz, GFP_KERNEL);
-	char *pos = buf, *end = buf + bufsz - 1;
+	char *buf;
+	char *pos, *end;
 	ssize_t rv;
 	int i;
 	int ln;
 
+	buf = kzalloc(bufsz, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	pos = buf;
+	end = buf + bufsz - 1;
+
 	pos += scnprintf(pos, end - pos, "pending:\n");
 
 	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
-- 
cgit v1.2.3


From a4956dca0764569640374ae1afb8be54a23201b8 Mon Sep 17 00:00:00 2001
From: Luca Coelho
Date: Tue, 7 Feb 2017 22:13:56 +0200
Subject: cfg80211: make rdev assignment clearer in nl80211_testmode_dump()

Avoid assigning rdev to NULL when we already have it and getting it
again from the wiphy index, by moving this code to relevant if block.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 63dfa60a29ef..a7b4318f735d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8585,6 +8585,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 		 * so we need to offset by 1.
 		 */
 		phy_idx = cb->args[0] - 1;
+
+		rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
+		if (!rdev) {
+			err = -ENOENT;
+			goto out_err;
+		}
 	} else {
 		struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
 
@@ -8599,7 +8605,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 			goto out_err;
 		}
 		phy_idx = rdev->wiphy_idx;
-		rdev = NULL;
 
 		if (attrbuf[NL80211_ATTR_TESTDATA])
 			cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
@@ -8610,12 +8615,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 		data_len = nla_len((void *)cb->args[1]);
 	}
 
-	rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
-	if (!rdev) {
-		err = -ENOENT;
-		goto out_err;
-	}
-
 	if (!rdev->ops->testmode_dump) {
 		err = -EOPNOTSUPP;
 		goto out_err;
-- 
cgit v1.2.3


From 66cd794e3c30b8af3b6befe42a378557efb3114a Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 7 Feb 2017 22:40:44 +0200
Subject: nl80211: add HT/VHT capabilities to AP parameters

For the benefit of drivers that rebuild IEs in firmware, parse the
IEs for HT/VHT capabilities and the respective membership selector
in the (extended) supported rates. This avoids duplicating the same
code into all drivers that need this information.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  3 ++-
 include/net/cfg80211.h    |  8 ++++++++
 net/wireless/nl80211.c    | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 02768de209d6..0dd9498c694f 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1043,8 +1043,9 @@ struct ieee80211_mgmt {
 	} u;
 } __packed __aligned(2);
 
-/* Supported Rates value encodings in 802.11n-2009 7.3.2.2 */
+/* Supported rates membership selectors */
 #define BSS_MEMBERSHIP_SELECTOR_HT_PHY	127
+#define BSS_MEMBERSHIP_SELECTOR_VHT_PHY	126
 
 /* mgmt header + 1 byte category code */
 #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u)
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 870549480e9b..5cfd2806a078 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -748,6 +748,10 @@ struct cfg80211_bitrate_mask {
  * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
  *	networks.
  * @beacon_rate: bitrate to be used for beacons
+ * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
+ * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
+ * @ht_required: stations must support HT
+ * @vht_required: stations must support VHT
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -768,6 +772,10 @@ struct cfg80211_ap_settings {
 	const struct cfg80211_acl_data *acl;
 	bool pbss;
 	struct cfg80211_bitrate_mask beacon_rate;
+
+	const struct ieee80211_ht_cap *ht_cap;
+	const struct ieee80211_vht_cap *vht_cap;
+	bool ht_required, vht_required;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a7b4318f735d..c853746f47bc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015-2016	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -3743,6 +3743,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
 	return 0;
 }
 
+static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
+					    const u8 *rates)
+{
+	int i;
+
+	if (!rates)
+		return;
+
+	for (i = 0; i < rates[1]; i++) {
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+			params->ht_required = true;
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
+			params->vht_required = true;
+	}
+}
+
+/*
+ * Since the nl80211 API didn't include, from the beginning, attributes about
+ * HT/VHT requirements/capabilities, we parse them out of the IEs for the
+ * benefit of drivers that rebuild IEs in the firmware.
+ */
+static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
+{
+	const struct cfg80211_beacon_data *bcn = &params->beacon;
+	size_t ies_len = bcn->beacon_ies_len;
+	const u8 *ies = bcn->beacon_ies;
+	const u8 *rates;
+	const u8 *cap;
+
+	rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->ht_cap))
+		params->ht_cap = (void *)(cap + 2);
+	cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->vht_cap))
+		params->vht_cap = (void *)(cap + 2);
+}
+
 static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
 				   struct cfg80211_ap_settings *params)
 {
@@ -3971,6 +4014,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	nl80211_calculate_ap_params(&params);
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
-- 
cgit v1.2.3


From aad1e812eee31a0e075709c247577b0328a6deab Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Fri, 27 Jan 2017 12:27:44 +0000
Subject: nl80211: fix validation of scheduled scan info for wowlan netdetect

For wowlan netdetect a separate limit is defined for the number of
matchsets. Currently, this limit is ignored and the regular limit
for scheduled scan matchsets, ie. struct wiphy::max_match_sets, is
used for the net-detect case as well.

Cc: Johannes Berg <johannes@sipsolutions.net>
Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c853746f47bc..b455898df63c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6912,7 +6912,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
 
 static struct cfg80211_sched_scan_request *
 nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
-			 struct nlattr **attrs)
+			 struct nlattr **attrs, int max_match_sets)
 {
 	struct cfg80211_sched_scan_request *request;
 	struct nlattr *attr;
@@ -6977,7 +6977,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 	if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
 		n_match_sets = 1;
 
-	if (n_match_sets > wiphy->max_match_sets)
+	if (n_match_sets > max_match_sets)
 		return ERR_PTR(-EINVAL);
 
 	if (attrs[NL80211_ATTR_IE])
@@ -7277,7 +7277,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
 		return -EINPROGRESS;
 
 	sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
-						  info->attrs);
+						  info->attrs,
+						  rdev->wiphy.max_match_sets);
 
 	err = PTR_ERR_OR_ZERO(sched_scan_req);
 	if (err)
@@ -10089,7 +10090,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
 	if (err)
 		goto out;
 
-	trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb);
+	trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb,
+						   wowlan->max_nd_match_sets);
 	err = PTR_ERR_OR_ZERO(trig->nd_config);
 	if (err)
 		trig->nd_config = NULL;
-- 
cgit v1.2.3


From 769f07d8f0fb6a68a0eda6308bbe890bff894fd7 Mon Sep 17 00:00:00 2001
From: Andrzej Zaborowski
Date: Wed, 25 Jan 2017 12:43:40 +0100
Subject: mac80211: Pass new RSSI level in CQM RSSI notification

Extend ieee80211_cqm_rssi_notify with a rssi_level parameter so that
this information can be passed to netlink clients in the next patch, if
available.  Most drivers will have this value at hand.  wl1251 receives
events from the firmware that only tell it whether latest measurement
is above or below threshold so we don't pass any value at this time
(parameter is 0).

Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rx.c |  2 ++
 drivers/net/wireless/rsi/rsi_91x_mac80211.c |  2 +-
 drivers/net/wireless/st/cw1200/sta.c        |  2 +-
 drivers/net/wireless/ti/wl1251/event.c      |  4 ++--
 drivers/net/wireless/ti/wlcore/event.c      |  3 ++-
 include/net/mac80211.h                      |  2 ++
 net/mac80211/mlme.c                         |  7 ++++---
 net/mac80211/trace.h                        | 11 +++++++----
 8 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
index 0e60e38b2acf..e06a2e323cc8 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
@@ -571,6 +571,7 @@ static void iwl_mvm_stat_iterator(void *_data, u8 *mac,
 		ieee80211_cqm_rssi_notify(
 			vif,
 			NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
+			sig,
 			GFP_KERNEL);
 	} else if (sig > thold &&
 		   (last_event == 0 || sig > last_event + hyst)) {
@@ -580,6 +581,7 @@ static void iwl_mvm_stat_iterator(void *_data, u8 *mac,
 		ieee80211_cqm_rssi_notify(
 			vif,
 			NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
+			sig,
 			GFP_KERNEL);
 	}
 }
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index dadaa73ab49d..e3216473aecb 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -877,7 +877,7 @@ static void rsi_perform_cqm(struct rsi_common *common,
 
 	common->cqm_info.last_cqm_event_rssi = rssi;
 	rsi_dbg(INFO_ZONE, "CQM: Notifying event: %d\n", event);
-	ieee80211_cqm_rssi_notify(adapter->vifs[0], event, GFP_KERNEL);
+	ieee80211_cqm_rssi_notify(adapter->vifs[0], event, rssi, GFP_KERNEL);
 
 	return;
 }
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index daf06a4f842e..a52224836a2b 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -1019,7 +1019,7 @@ void cw1200_event_handler(struct work_struct *work)
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW :
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH;
 			pr_debug("[CQM] RSSI event: %d.\n", rcpi_rssi);
-			ieee80211_cqm_rssi_notify(priv->vif, cqm_evt,
+			ieee80211_cqm_rssi_notify(priv->vif, cqm_evt, rcpi_rssi,
 						  GFP_KERNEL);
 			break;
 		}
diff --git a/drivers/net/wireless/ti/wl1251/event.c b/drivers/net/wireless/ti/wl1251/event.c
index d0593bc1f1a9..f5acd24d0e2b 100644
--- a/drivers/net/wireless/ti/wl1251/event.c
+++ b/drivers/net/wireless/ti/wl1251/event.c
@@ -150,7 +150,7 @@ static int wl1251_event_process(struct wl1251 *wl, struct event_mailbox *mbox)
 				     "ROAMING_TRIGGER_LOW_RSSI_EVENT");
 			ieee80211_cqm_rssi_notify(wl->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
-				GFP_KERNEL);
+				0, GFP_KERNEL);
 		}
 
 		if (vector & ROAMING_TRIGGER_REGAINED_RSSI_EVENT_ID) {
@@ -158,7 +158,7 @@ static int wl1251_event_process(struct wl1251 *wl, struct event_mailbox *mbox)
 				     "ROAMING_TRIGGER_REGAINED_RSSI_EVENT");
 			ieee80211_cqm_rssi_notify(wl->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
-				GFP_KERNEL);
+				0, GFP_KERNEL);
 		}
 	}
 
diff --git a/drivers/net/wireless/ti/wlcore/event.c b/drivers/net/wireless/ti/wlcore/event.c
index 4b59f67724de..f2e90d223d94 100644
--- a/drivers/net/wireless/ti/wlcore/event.c
+++ b/drivers/net/wireless/ti/wlcore/event.c
@@ -129,7 +129,8 @@ void wlcore_event_rssi_trigger(struct wl1271 *wl, s8 *metric_arr)
 
 		vif = wl12xx_wlvif_to_vif(wlvif);
 		if (event != wlvif->last_rssi_event)
-			ieee80211_cqm_rssi_notify(vif, event, GFP_KERNEL);
+			ieee80211_cqm_rssi_notify(vif, event, metric,
+						  GFP_KERNEL);
 		wlvif->last_rssi_event = event;
 	}
 }
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 33624ffbd5a5..b9a08cd1d97d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5278,6 +5278,7 @@ void ieee80211_resume_disconnect(struct ieee80211_vif *vif);
  *
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @rssi_event: the RSSI trigger event type
+ * @rssi_level: new RSSI level value or 0 if not available
  * @gfp: context flags
  *
  * When the %IEEE80211_VIF_SUPPORTS_CQM_RSSI is set, and a connection quality
@@ -5286,6 +5287,7 @@ void ieee80211_resume_disconnect(struct ieee80211_vif *vif);
  */
 void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
 			       enum nl80211_cqm_rssi_threshold_event rssi_event,
+			       s32 rssi_level,
 			       gfp_t gfp);
 
 /**
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8a6344518674..ee423688c92e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3419,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 			ieee80211_cqm_rssi_notify(
 				&sdata->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
-				GFP_KERNEL);
+				sig, GFP_KERNEL);
 		} else if (sig > thold &&
 			   (last_event == 0 || sig > last_event + hyst)) {
 			ifmgd->last_cqm_event_signal = sig;
 			ieee80211_cqm_rssi_notify(
 				&sdata->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
-				GFP_KERNEL);
+				sig, GFP_KERNEL);
 		}
 	}
 
@@ -5041,11 +5041,12 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 
 void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
 			       enum nl80211_cqm_rssi_threshold_event rssi_event,
+			       s32 rssi_level,
 			       gfp_t gfp)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 
-	trace_api_cqm_rssi_notify(sdata, rssi_event);
+	trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
 
 	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
 }
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 92a47afaa989..f78d9f4f8711 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss,
 
 TRACE_EVENT(api_cqm_rssi_notify,
 	TP_PROTO(struct ieee80211_sub_if_data *sdata,
-		 enum nl80211_cqm_rssi_threshold_event rssi_event),
+		 enum nl80211_cqm_rssi_threshold_event rssi_event,
+		 s32 rssi_level),
 
-	TP_ARGS(sdata, rssi_event),
+	TP_ARGS(sdata, rssi_event, rssi_level),
 
 	TP_STRUCT__entry(
 		VIF_ENTRY
 		__field(u32, rssi_event)
+		__field(s32, rssi_level)
 	),
 
 	TP_fast_assign(
 		VIF_ASSIGN;
 		__entry->rssi_event = rssi_event;
+		__entry->rssi_level = rssi_level;
 	),
 
 	TP_printk(
-		VIF_PR_FMT " event:%d",
-		VIF_PR_ARG, __entry->rssi_event
+		VIF_PR_FMT " event:%d rssi:%d",
+		VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
 	)
 );
 
-- 
cgit v1.2.3


From bee427b86217b78a0a5fc85575cc155e4c32bbf9 Mon Sep 17 00:00:00 2001
From: Andrzej Zaborowski
Date: Wed, 25 Jan 2017 12:43:41 +0100
Subject: cfg80211: Pass new RSSI level in CQM RSSI notification

Update the drivers to pass the RSSI level as a cfg80211_cqm_rssi_notify
parameter and pass this value to userspace in a new nl80211 attribute.
This helps both userspace and also helps in the implementation of the
multiple RSSI thresholds CQM mechanism.

Note for marvell/mwifiex I pass 0 for the RSSI value because the new
RSSI value is not available to the driver at the time of the
cfg80211_cqm_rssi_notify call, but the driver queries the new value
immediately after that, so it is actually available just a moment later
if we wanted to defer caling cfg80211_cqm_rssi_notify until that moment.
Without this, the new cfg80211 code (patch 3) will call .get_station
which will send a duplicate HostCmd_CMD_RSSI_INFO command to the hardware.

Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/marvell/mwifiex/sta_event.c |  4 ++--
 drivers/net/wireless/rndis_wlan.c                |  2 +-
 include/net/cfg80211.h                           |  3 ++-
 include/uapi/linux/nl80211.h                     |  3 +++
 net/mac80211/mlme.c                              |  2 +-
 net/wireless/nl80211.c                           |  9 +++++++--
 net/wireless/trace.h                             | 11 +++++++----
 7 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/marvell/mwifiex/sta_event.c b/drivers/net/wireless/marvell/mwifiex/sta_event.c
index 9df0c4dc06ed..5cc3aa7c31cd 100644
--- a/drivers/net/wireless/marvell/mwifiex/sta_event.c
+++ b/drivers/net/wireless/marvell/mwifiex/sta_event.c
@@ -824,7 +824,7 @@ int mwifiex_process_sta_event(struct mwifiex_private *priv)
 	case EVENT_RSSI_LOW:
 		cfg80211_cqm_rssi_notify(priv->netdev,
 					 NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
-					 GFP_KERNEL);
+					 0, GFP_KERNEL);
 		mwifiex_send_cmd(priv, HostCmd_CMD_RSSI_INFO,
 				 HostCmd_ACT_GEN_GET, 0, NULL, false);
 		priv->subsc_evt_rssi_state = RSSI_LOW_RECVD;
@@ -839,7 +839,7 @@ int mwifiex_process_sta_event(struct mwifiex_private *priv)
 	case EVENT_RSSI_HIGH:
 		cfg80211_cqm_rssi_notify(priv->netdev,
 					 NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
-					 GFP_KERNEL);
+					 0, GFP_KERNEL);
 		mwifiex_send_cmd(priv, HostCmd_CMD_RSSI_INFO,
 				 HostCmd_ACT_GEN_GET, 0, NULL, false);
 		priv->subsc_evt_rssi_state = RSSI_HIGH_RECVD;
diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c
index 603c90470225..785334f7a538 100644
--- a/drivers/net/wireless/rndis_wlan.c
+++ b/drivers/net/wireless/rndis_wlan.c
@@ -3187,7 +3187,7 @@ static void rndis_do_cqm(struct usbnet *usbdev, s32 rssi)
 		return;
 
 	priv->last_cqm_event_rssi = rssi;
-	cfg80211_cqm_rssi_notify(usbdev->net, event, GFP_KERNEL);
+	cfg80211_cqm_rssi_notify(usbdev->net, event, rssi, GFP_KERNEL);
 }
 
 #define DEVICE_POLLER_JIFFIES (HZ)
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5cfd2806a078..a2c18b53e053 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5390,6 +5390,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
  * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
  * @dev: network device
  * @rssi_event: the triggered RSSI event
+ * @rssi_level: new RSSI level value or 0 if not available
  * @gfp: context flags
  *
  * This function is called when a configured connection quality monitoring
@@ -5397,7 +5398,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
  */
 void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			      enum nl80211_cqm_rssi_threshold_event rssi_event,
-			      gfp_t gfp);
+			      s32 rssi_level, gfp_t gfp);
 
 /**
  * cfg80211_cqm_pktloss_notify - notify userspace about packetloss to peer
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d6c62ee9bd1d..cd547b864595 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3952,6 +3952,8 @@ enum nl80211_ps_state {
  *	%NL80211_CMD_NOTIFY_CQM. Set to 0 to turn off TX error reporting.
  * @NL80211_ATTR_CQM_BEACON_LOSS_EVENT: flag attribute that's set in a beacon
  *	loss event
+ * @NL80211_ATTR_CQM_RSSI_LEVEL: the RSSI value in dBm that triggered the
+ *	RSSI threshold event.
  * @__NL80211_ATTR_CQM_AFTER_LAST: internal
  * @NL80211_ATTR_CQM_MAX: highest key attribute
  */
@@ -3965,6 +3967,7 @@ enum nl80211_attr_cqm {
 	NL80211_ATTR_CQM_TXE_PKTS,
 	NL80211_ATTR_CQM_TXE_INTVL,
 	NL80211_ATTR_CQM_BEACON_LOSS_EVENT,
+	NL80211_ATTR_CQM_RSSI_LEVEL,
 
 	/* keep last */
 	__NL80211_ATTR_CQM_AFTER_LAST,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ee423688c92e..6e90301154d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5048,7 +5048,7 @@ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
 
 	trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
 
-	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
+	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp);
 }
 EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b455898df63c..9d738f75bd4e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9474,6 +9474,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
 	[NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
 	[NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
 	[NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
+	[NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
 };
 
 static int nl80211_set_cqm_txe(struct genl_info *info,
@@ -13959,11 +13960,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
 
 void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			      enum nl80211_cqm_rssi_threshold_event rssi_event,
-			      gfp_t gfp)
+			      s32 rssi_level, gfp_t gfp)
 {
 	struct sk_buff *msg;
 
-	trace_cfg80211_cqm_rssi_notify(dev, rssi_event);
+	trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
 
 	if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
 		    rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
@@ -13977,6 +13978,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			rssi_event))
 		goto nla_put_failure;
 
+	if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
+				      rssi_level))
+		goto nla_put_failure;
+
 	cfg80211_send_cqm(msg, gfp);
 
 	return;
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ea1b47e04fa4..2419c390f150 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2490,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
 
 TRACE_EVENT(cfg80211_cqm_rssi_notify,
 	TP_PROTO(struct net_device *netdev,
-		 enum nl80211_cqm_rssi_threshold_event rssi_event),
-	TP_ARGS(netdev, rssi_event),
+		 enum nl80211_cqm_rssi_threshold_event rssi_event,
+		 s32 rssi_level),
+	TP_ARGS(netdev, rssi_event, rssi_level),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
+		__field(s32, rssi_level)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		__entry->rssi_event = rssi_event;
+		__entry->rssi_level = rssi_level;
 	),
-	TP_printk(NETDEV_PR_FMT ", rssi event: %d",
-		  NETDEV_PR_ARG, __entry->rssi_event)
+	TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
+		  NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
 );
 
 TRACE_EVENT(cfg80211_reg_can_beacon,
-- 
cgit v1.2.3


From c078ca3b0c5bf82c2b31906c446d6e2ad8ea0783 Mon Sep 17 00:00:00 2001
From: Phil Sutter
Date: Tue, 17 Jan 2017 22:51:26 +0100
Subject: netfilter: nft_exthdr: Add support for existence check

If NFT_EXTHDR_F_PRESENT is set, exthdr will not copy any header field
data into *dest, but instead set it to 1 if the header is found and 0
otherwise.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  6 ++++++
 net/netfilter/nft_exthdr.c               | 22 ++++++++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 7b730cab99bd..53aac8b8ed6b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -704,6 +704,10 @@ enum nft_payload_attributes {
 };
 #define NFTA_PAYLOAD_MAX	(__NFTA_PAYLOAD_MAX - 1)
 
+enum nft_exthdr_flags {
+	NFT_EXTHDR_F_PRESENT = (1 << 0),
+};
+
 /**
  * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes
  *
@@ -711,6 +715,7 @@ enum nft_payload_attributes {
  * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8)
  * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32)
  * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
+ * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32)
  */
 enum nft_exthdr_attributes {
 	NFTA_EXTHDR_UNSPEC,
@@ -718,6 +723,7 @@ enum nft_exthdr_attributes {
 	NFTA_EXTHDR_TYPE,
 	NFTA_EXTHDR_OFFSET,
 	NFTA_EXTHDR_LEN,
+	NFTA_EXTHDR_FLAGS,
 	__NFTA_EXTHDR_MAX
 };
 #define NFTA_EXTHDR_MAX		(__NFTA_EXTHDR_MAX - 1)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47beb3abcc9d..a89e5ab150db 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -23,6 +23,7 @@ struct nft_exthdr {
 	u8			offset;
 	u8			len;
 	enum nft_registers	dreg:8;
+	u8			flags;
 };
 
 static void nft_exthdr_eval(const struct nft_expr *expr,
@@ -35,8 +36,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr,
 	int err;
 
 	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
-	if (err < 0)
+	if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+		*dest = (err >= 0);
+		return;
+	} else if (err < 0) {
 		goto err;
+	}
 	offset += priv->offset;
 
 	dest[priv->len / NFT_REG32_SIZE] = 0;
@@ -52,6 +57,7 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
 	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int nft_exthdr_init(const struct nft_ctx *ctx,
@@ -59,7 +65,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
-	u32 offset, len;
+	u32 offset, len, flags = 0;
 	int err;
 
 	if (tb[NFTA_EXTHDR_DREG] == NULL ||
@@ -76,10 +82,20 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
+	if (tb[NFTA_EXTHDR_FLAGS]) {
+		err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags);
+		if (err < 0)
+			return err;
+
+		if (flags & ~NFT_EXTHDR_F_PRESENT)
+			return -EINVAL;
+	}
+
 	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
 	priv->offset = offset;
 	priv->len    = len;
 	priv->dreg   = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
+	priv->flags  = flags;
 
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, priv->len);
@@ -97,6 +113,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3


From 5cb82a38c6b5152b1deaba0c1596ce63222a4710 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:07 +0100
Subject: netfilter: nf_tables: pass netns to set->ops->remove()

This new parameter is required by the new bitmap set type that comes in a
follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ++-
 net/netfilter/nf_tables_api.c     | 6 +++---
 net/netfilter/nft_set_hash.c      | 3 ++-
 net/netfilter/nft_set_rbtree.c    | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7dfdb517f0be..a721bcb1210c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -298,7 +298,8 @@ struct nft_set_ops {
 	bool				(*deactivate_one)(const struct net *net,
 							  const struct nft_set *set,
 							  void *priv);
-	void				(*remove)(const struct nft_set *set,
+	void				(*remove)(const struct net *net,
+						  const struct nft_set *set,
 						  const struct nft_set_elem *elem);
 	void				(*walk)(const struct nft_ctx *ctx,
 						struct nft_set *set,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 57eeae63f597..3643ce345b59 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3752,7 +3752,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	return 0;
 
 err6:
-	set->ops->remove(set, &elem);
+	set->ops->remove(ctx->net, set, &elem);
 err5:
 	kfree(trans);
 err4:
@@ -4804,7 +4804,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_setelem_notify(&trans->ctx, te->set,
 						 &te->elem,
 						 NFT_MSG_DELSETELEM, 0);
-			te->set->ops->remove(te->set, &te->elem);
+			te->set->ops->remove(net, te->set, &te->elem);
 			atomic_dec(&te->set->nelems);
 			te->set->ndeact--;
 			break;
@@ -4925,7 +4925,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			te->set->ops->remove(te->set, &te->elem);
+			te->set->ops->remove(net, te->set, &te->elem);
 			atomic_dec(&te->set->nelems);
 			break;
 		case NFT_MSG_DELSETELEM:
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e36069fb76ae..bb157bd47fe8 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -203,7 +203,8 @@ static void *nft_hash_deactivate(const struct net *net,
 	return he;
 }
 
-static void nft_hash_remove(const struct nft_set *set,
+static void nft_hash_remove(const struct net *net,
+			    const struct nft_set *set,
 			    const struct nft_set_elem *elem)
 {
 	struct nft_hash *priv = nft_set_priv(set);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index f06f55ee516d..9fbd70da1633 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -151,7 +151,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	return err;
 }
 
-static void nft_rbtree_remove(const struct nft_set *set,
+static void nft_rbtree_remove(const struct net *net,
+			      const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
-- 
cgit v1.2.3


From baa2d42cff088d9f962697a10e0345b444c8c409 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:08 +0100
Subject: netfilter: nf_tables: use struct nft_set_iter in set element flush

Instead of struct nft_set_dump_args, remove unnecessary wrapper
structure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3643ce345b59..790ffed82930 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3936,15 +3936,13 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 		return -EBUSY;
 
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
-		struct nft_set_dump_args args = {
-			.iter	= {
-				.genmask	= genmask,
-				.fn		= nft_flush_set,
-			},
+		struct nft_set_iter iter = {
+			.genmask	= genmask,
+			.fn		= nft_flush_set,
 		};
-		set->ops->walk(&ctx, set, &args.iter);
+		set->ops->walk(&ctx, set, &iter);
 
-		return args.iter.err;
+		return iter.err;
 	}
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-- 
cgit v1.2.3


From 1ba1c41408df8a9d2f8b9b67e4c9e6f59b29d8ee Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:09 +0100
Subject: netfilter: nf_tables: rename deactivate_one() to flush()

Although semantics are similar to deactivate() with no implicit element
lookup, this is only called from the set flush path, so better rename
this to flush().

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 8 ++++----
 net/netfilter/nf_tables_api.c     | 2 +-
 net/netfilter/nft_set_hash.c      | 8 ++++----
 net/netfilter/nft_set_rbtree.c    | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index a721bcb1210c..ab155644d489 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -260,7 +260,7 @@ struct nft_expr;
  *	@insert: insert new element into set
  *	@activate: activate new element in the next generation
  *	@deactivate: lookup for element and deactivate it in the next generation
- *	@deactivate_one: deactivate element in the next generation
+ *	@flush: deactivate element in the next generation
  *	@remove: remove element from set
  *	@walk: iterate over all set elemeennts
  *	@privsize: function to return size of set private data
@@ -295,9 +295,9 @@ struct nft_set_ops {
 	void *				(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
-	bool				(*deactivate_one)(const struct net *net,
-							  const struct nft_set *set,
-							  void *priv);
+	bool				(*flush)(const struct net *net,
+						 const struct nft_set *set,
+						 void *priv);
 	void				(*remove)(const struct net *net,
 						  const struct nft_set *set,
 						  const struct nft_set_elem *elem);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 790ffed82930..c09b11eb36fc 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3898,7 +3898,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
 	if (!trans)
 		return -ENOMEM;
 
-	if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) {
+	if (!set->ops->flush(ctx->net, set, elem->priv)) {
 		err = -ENOENT;
 		goto err1;
 	}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index bb157bd47fe8..2f10ac3b1b10 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,8 +167,8 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 	nft_set_elem_clear_busy(&he->ext);
 }
 
-static bool nft_hash_deactivate_one(const struct net *net,
-				    const struct nft_set *set, void *priv)
+static bool nft_hash_flush(const struct net *net,
+			   const struct nft_set *set, void *priv)
 {
 	struct nft_hash_elem *he = priv;
 
@@ -195,7 +195,7 @@ static void *nft_hash_deactivate(const struct net *net,
 	rcu_read_lock();
 	he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
 	if (he != NULL &&
-	    !nft_hash_deactivate_one(net, set, he))
+	    !nft_hash_flush(net, set, he))
 		he = NULL;
 
 	rcu_read_unlock();
@@ -398,7 +398,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
 	.insert		= nft_hash_insert,
 	.activate	= nft_hash_activate,
 	.deactivate	= nft_hash_deactivate,
-	.deactivate_one	= nft_hash_deactivate_one,
+	.flush		= nft_hash_flush,
 	.remove		= nft_hash_remove,
 	.lookup		= nft_hash_lookup,
 	.update		= nft_hash_update,
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 9fbd70da1633..81b8a4c2c061 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -172,8 +172,8 @@ static void nft_rbtree_activate(const struct net *net,
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
-static bool nft_rbtree_deactivate_one(const struct net *net,
-				      const struct nft_set *set, void *priv)
+static bool nft_rbtree_flush(const struct net *net,
+			     const struct nft_set *set, void *priv)
 {
 	struct nft_rbtree_elem *rbe = priv;
 
@@ -214,7 +214,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
 				parent = parent->rb_right;
 				continue;
 			}
-			nft_rbtree_deactivate_one(net, set, rbe);
+			nft_rbtree_flush(net, set, rbe);
 			return rbe;
 		}
 	}
@@ -305,7 +305,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
 	.insert		= nft_rbtree_insert,
 	.remove		= nft_rbtree_remove,
 	.deactivate	= nft_rbtree_deactivate,
-	.deactivate_one	= nft_rbtree_deactivate_one,
+	.flush		= nft_rbtree_flush,
 	.activate	= nft_rbtree_activate,
 	.lookup		= nft_rbtree_lookup,
 	.walk		= nft_rbtree_walk,
-- 
cgit v1.2.3


From 1f48ff6c5393aa7fe290faf5d633164f105b0aa7 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:10 +0100
Subject: netfilter: nf_tables: add flush field to struct nft_set_iter

This provides context to walk callback iterator, thus, we know if the
walk happens from the set flush path. This is required by the new bitmap
set type coming in a follow up patch which has no real struct
nft_set_ext, so it has to allocate it based on the two bit compact
element representation.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 1 +
 net/netfilter/nf_tables_api.c     | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ab155644d489..5830f594842e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -203,6 +203,7 @@ struct nft_set_elem {
 struct nft_set;
 struct nft_set_iter {
 	u8		genmask;
+	bool		flush;
 	unsigned int	count;
 	unsigned int	skip;
 	int		err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c09b11eb36fc..7ae810b03462 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3121,6 +3121,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		iter.count	= 0;
 		iter.err	= 0;
 		iter.fn		= nf_tables_bind_check_setelem;
+		iter.flush	= false;
 
 		set->ops->walk(ctx, set, &iter);
 		if (iter.err < 0)
@@ -3374,6 +3375,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	args.iter.count		= 0;
 	args.iter.err		= 0;
 	args.iter.fn		= nf_tables_dump_setelem;
+	args.iter.flush		= false;
 	set->ops->walk(&ctx, set, &args.iter);
 
 	nla_nest_end(skb, nest);
@@ -3939,6 +3941,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 		struct nft_set_iter iter = {
 			.genmask	= genmask,
 			.fn		= nft_flush_set,
+			.flush		= true,
 		};
 		set->ops->walk(&ctx, set, &iter);
 
@@ -5089,6 +5092,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 			iter.count	= 0;
 			iter.err	= 0;
 			iter.fn		= nf_tables_loop_check_setelem;
+			iter.flush	= false;
 
 			set->ops->walk(ctx, set, &iter);
 			if (iter.err < 0)
-- 
cgit v1.2.3


From 55af753cd9fda9c5300f5318253b08bd15fb412e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:11 +0100
Subject: netfilter: nf_tables: rename struct nft_set_estimate class field

Use lookup as field name instead, to prepare the introduction of the
memory class in a follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nf_tables_api.c     | 12 ++++++------
 net/netfilter/nft_set_hash.c      |  2 +-
 net/netfilter/nft_set_rbtree.c    |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5830f594842e..d76ac2f80a40 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -244,11 +244,11 @@ enum nft_set_class {
  *				  characteristics
  *
  *	@size: required memory
- *	@class: lookup performance class
+ *	@lookup: lookup performance class
  */
 struct nft_set_estimate {
 	unsigned int		size;
-	enum nft_set_class	class;
+	enum nft_set_class	lookup;
 };
 
 struct nft_set_ext;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7ae810b03462..fa7cd1679079 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2401,9 +2401,9 @@ nft_select_set_ops(const struct nlattr * const nla[],
 		features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT;
 	}
 
-	bops	   = NULL;
-	best.size  = ~0;
-	best.class = ~0;
+	bops	    = NULL;
+	best.size   = ~0;
+	best.lookup = ~0;
 
 	list_for_each_entry(ops, &nf_tables_set_ops, list) {
 		if ((ops->features & features) != features)
@@ -2413,15 +2413,15 @@ nft_select_set_ops(const struct nlattr * const nla[],
 
 		switch (policy) {
 		case NFT_SET_POL_PERFORMANCE:
-			if (est.class < best.class)
+			if (est.lookup < best.lookup)
 				break;
-			if (est.class == best.class && est.size < best.size)
+			if (est.lookup == best.lookup && est.size < best.size)
 				break;
 			continue;
 		case NFT_SET_POL_MEMORY:
 			if (est.size < best.size)
 				break;
-			if (est.size == best.size && est.class < best.class)
+			if (est.size == best.size && est.lookup < best.lookup)
 				break;
 			continue;
 		default:
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 2f10ac3b1b10..e58e7f02138b 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -384,7 +384,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 		est->size = esize + 2 * sizeof(struct nft_hash_elem *);
 	}
 
-	est->class = NFT_SET_CLASS_O_1;
+	est->lookup = NFT_SET_CLASS_O_1;
 
 	return true;
 }
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 81b8a4c2c061..2b6ea10c4bbd 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -291,7 +291,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	else
 		est->size = nsize;
 
-	est->class = NFT_SET_CLASS_O_LOG_N;
+	est->lookup = NFT_SET_CLASS_O_LOG_N;
 
 	return true;
 }
-- 
cgit v1.2.3


From 0b5a78749260560f41e3b7c1f60f2c7dd9aff4f0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:12 +0100
Subject: netfilter: nf_tables: add space notation to sets

The space notation allows us to classify the set backend implementation
based on the amount of required memory. This provides an order of the
set representation scalability in terms of memory. The size field is
still left in place so use this if the userspace provides no explicit
number of elements, so we cannot calculate the real memory that this set
needs. This also helps us break ties in the set backend selection
routine, eg. two backend implementations provide the same performance.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c     | 22 +++++++++++++++++-----
 net/netfilter/nft_set_hash.c      |  1 +
 net/netfilter/nft_set_rbtree.c    |  1 +
 4 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d76ac2f80a40..21ce50e6d0c5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -245,10 +245,12 @@ enum nft_set_class {
  *
  *	@size: required memory
  *	@lookup: lookup performance class
+ *	@space: memory class
  */
 struct nft_set_estimate {
 	unsigned int		size;
 	enum nft_set_class	lookup;
+	enum nft_set_class	space;
 };
 
 struct nft_set_ext;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fa7cd1679079..cb6ae46f6c48 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2404,6 +2404,7 @@ nft_select_set_ops(const struct nlattr * const nla[],
 	bops	    = NULL;
 	best.size   = ~0;
 	best.lookup = ~0;
+	best.space  = ~0;
 
 	list_for_each_entry(ops, &nf_tables_set_ops, list) {
 		if ((ops->features & features) != features)
@@ -2415,14 +2416,25 @@ nft_select_set_ops(const struct nlattr * const nla[],
 		case NFT_SET_POL_PERFORMANCE:
 			if (est.lookup < best.lookup)
 				break;
-			if (est.lookup == best.lookup && est.size < best.size)
-				break;
+			if (est.lookup == best.lookup) {
+				if (!desc->size) {
+					if (est.space < best.space)
+						break;
+				} else if (est.size < best.size) {
+					break;
+				}
+			}
 			continue;
 		case NFT_SET_POL_MEMORY:
-			if (est.size < best.size)
-				break;
-			if (est.size == best.size && est.lookup < best.lookup)
+			if (!desc->size) {
+				if (est.space < best.space)
+					break;
+				if (est.space == best.space &&
+				    est.lookup < best.lookup)
+					break;
+			} else if (est.size < best.size) {
 				break;
+			}
 			continue;
 		default:
 			break;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e58e7f02138b..6938bc890f31 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -385,6 +385,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 	}
 
 	est->lookup = NFT_SET_CLASS_O_1;
+	est->space  = NFT_SET_CLASS_O_N;
 
 	return true;
 }
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 2b6ea10c4bbd..3387ed7dd231 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -292,6 +292,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 		est->size = nsize;
 
 	est->lookup = NFT_SET_CLASS_O_LOG_N;
+	est->space  = NFT_SET_CLASS_O_N;
 
 	return true;
 }
-- 
cgit v1.2.3


From 665153ff575207f3a092cfcea3c51238612a7b58 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jan 2017 18:30:13 +0100
Subject: netfilter: nf_tables: add bitmap set type

This patch adds a new bitmap set type. This bitmap uses two bits to
represent one element. These two bits determine the element state in the
current and the future generation that fits into the nf_tables commit
protocol. When dumping elements back to userspace, the two bits are
expanded into a struct nft_set_ext object.

If no NFTA_SET_DESC_SIZE is specified, the existing automatic set
backend selection prefers bitmap over hash in case of keys whose size is
<= 16 bit. If the set size is know, the bitmap set type is selected if
with 16 bit kets and more than 390 elements in the set, otherwise the
hash table set implementation is used.

For 8 bit keys, the bitmap consumes 66 bytes. For 16 bit keys, the
bitmap takes 16388 bytes.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig          |   6 +
 net/netfilter/Makefile         |   1 +
 net/netfilter/nft_set_bitmap.c | 314 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 321 insertions(+)
 create mode 100644 net/netfilter/nft_set_bitmap.c

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dfbe9deeb8c4..ea479ed43373 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -509,6 +509,12 @@ config NFT_SET_HASH
 	  This option adds the "hash" set type that is used to build one-way
 	  mappings between matchings and actions.
 
+config NFT_SET_BITMAP
+	tristate "Netfilter nf_tables bitmap set module"
+	help
+	  This option adds the "bitmap" set type that is used to build sets
+	  whose keys are smaller or equal to 16 bits.
+
 config NFT_COUNTER
 	tristate "Netfilter nf_tables counter module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 6b3034f12661..c9b78e7b342f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o
 obj-$(CONFIG_NFT_SET_RBTREE)	+= nft_set_rbtree.o
 obj-$(CONFIG_NFT_SET_HASH)	+= nft_set_hash.o
+obj-$(CONFIG_NFT_SET_BITMAP)	+= nft_set_bitmap.o
 obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
 obj-$(CONFIG_NFT_MASQ)		+= nft_masq.o
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
new file mode 100644
index 000000000000..97f9649bcc7e
--- /dev/null
+++ b/net/netfilter/nft_set_bitmap.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* This bitmap uses two bits to represent one element. These two bits determine
+ * the element state in the current and the future generation.
+ *
+ * An element can be in three states. The generation cursor is represented using
+ * the ^ character, note that this cursor shifts on every succesful transaction.
+ * If no transaction is going on, we observe all elements are in the following
+ * state:
+ *
+ * 11 = this element is active in the current generation. In case of no updates,
+ * ^    it stays active in the next generation.
+ * 00 = this element is inactive in the current generation. In case of no
+ * ^    updates, it stays inactive in the next generation.
+ *
+ * On transaction handling, we observe these two temporary states:
+ *
+ * 01 = this element is inactive in the current generation and it becomes active
+ * ^    in the next one. This happens when the element is inserted but commit
+ *      path has not yet been executed yet, so activation is still pending. On
+ *      transaction abortion, the element is removed.
+ * 10 = this element is active in the current generation and it becomes inactive
+ * ^    in the next one. This happens when the element is deactivated but commit
+ *      path has not yet been executed yet, so removal is still pending. On
+ *      transation abortion, the next generation bit is reset to go back to
+ *      restore its previous state.
+ */
+struct nft_bitmap {
+	u16	bitmap_size;
+	u8	bitmap[];
+};
+
+static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off)
+{
+	u32 k = (key << 1);
+
+	*idx = k / BITS_PER_BYTE;
+	*off = k % BITS_PER_BYTE;
+}
+
+/* Fetch the two bits that represent the element and check if it is active based
+ * on the generation mask.
+ */
+static inline bool
+nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
+{
+	return (bitmap[idx] & (0x3 << off)) & (genmask << off);
+}
+
+static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+			      const u32 *key, const struct nft_set_ext **ext)
+{
+	const struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_cur(net);
+	u32 idx, off;
+
+	nft_bitmap_location(*key, &idx, &off);
+
+	return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+}
+
+static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
+			     const struct nft_set_elem *elem,
+			     struct nft_set_ext **_ext)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+		return -EEXIST;
+
+	/* Enter 01 state. */
+	priv->bitmap[idx] |= (genmask << off);
+
+	return 0;
+}
+
+static void nft_bitmap_remove(const struct net *net,
+			      const struct nft_set *set,
+			      const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 00 state. */
+	priv->bitmap[idx] &= ~(genmask << off);
+}
+
+static void nft_bitmap_activate(const struct net *net,
+				const struct nft_set *set,
+				const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 11 state. */
+	priv->bitmap[idx] |= (genmask << off);
+}
+
+static bool nft_bitmap_flush(const struct net *net,
+			     const struct nft_set *set, void *ext)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 10 state, similar to deactivation. */
+	priv->bitmap[idx] &= ~(genmask << off);
+
+	return true;
+}
+
+static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set,
+						const struct nft_set_elem *elem)
+{
+	struct nft_set_ext_tmpl tmpl;
+	struct nft_set_ext *ext;
+
+	nft_set_ext_prepare(&tmpl);
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+	ext = kzalloc(tmpl.len, GFP_KERNEL);
+	if (!ext)
+		return NULL;
+
+	nft_set_ext_init(ext, &tmpl);
+	memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen);
+
+	return ext;
+}
+
+static void *nft_bitmap_deactivate(const struct net *net,
+				   const struct nft_set *set,
+				   const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_next(net);
+	struct nft_set_ext *ext;
+	u32 idx, off, key = 0;
+
+	memcpy(&key, elem->key.val.data, set->klen);
+	nft_bitmap_location(key, &idx, &off);
+
+	if (!nft_bitmap_active(priv->bitmap, idx, off, genmask))
+		return NULL;
+
+	/* We have no real set extension since this is a bitmap, allocate this
+	 * dummy object that is released from the commit/abort path.
+	 */
+	ext = nft_bitmap_ext_alloc(set, elem);
+	if (!ext)
+		return NULL;
+
+	/* Enter 10 state. */
+	priv->bitmap[idx] &= ~(genmask << off);
+
+	return ext;
+}
+
+static void nft_bitmap_walk(const struct nft_ctx *ctx,
+			    struct nft_set *set,
+			    struct nft_set_iter *iter)
+{
+	const struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext_tmpl tmpl;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int idx, off;
+	u16 key;
+
+	nft_set_ext_prepare(&tmpl);
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+	for (idx = 0; idx < priv->bitmap_size; idx++) {
+		for (off = 0; off < BITS_PER_BYTE; off += 2) {
+			if (iter->count < iter->skip)
+				goto cont;
+
+			if (!nft_bitmap_active(priv->bitmap, idx, off,
+					       iter->genmask))
+				goto cont;
+
+			ext = kzalloc(tmpl.len, GFP_KERNEL);
+			if (!ext) {
+				iter->err = -ENOMEM;
+				return;
+			}
+			nft_set_ext_init(ext, &tmpl);
+			key = ((idx * BITS_PER_BYTE) + off) >> 1;
+			memcpy(nft_set_ext_key(ext), &key, set->klen);
+
+			elem.priv = ext;
+			iter->err = iter->fn(ctx, set, iter, &elem);
+
+			/* On set flush, this dummy extension object is released
+			 * from the commit/abort path.
+			 */
+			if (!iter->flush)
+				kfree(ext);
+
+			if (iter->err < 0)
+				return;
+cont:
+			iter->count++;
+		}
+	}
+}
+
+/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
+ * multiplied by two since each element takes two bits. For 8 bit keys, the
+ * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
+ */
+static inline u32 nft_bitmap_size(u32 klen)
+{
+	return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
+}
+
+static inline u32 nft_bitmap_total_size(u32 klen)
+{
+	return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
+}
+
+static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[])
+{
+	u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+
+	return nft_bitmap_total_size(klen);
+}
+
+static int nft_bitmap_init(const struct nft_set *set,
+			 const struct nft_set_desc *desc,
+			 const struct nlattr * const nla[])
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+
+	priv->bitmap_size = nft_bitmap_total_size(set->klen);
+
+	return 0;
+}
+
+static void nft_bitmap_destroy(const struct nft_set *set)
+{
+}
+
+static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
+				struct nft_set_estimate *est)
+{
+	/* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
+	if (desc->klen > 2)
+		return false;
+
+	est->size   = nft_bitmap_total_size(desc->klen);
+	est->lookup = NFT_SET_CLASS_O_1;
+	est->space  = NFT_SET_CLASS_O_1;
+
+	return true;
+}
+
+static struct nft_set_ops nft_bitmap_ops __read_mostly = {
+	.privsize	= nft_bitmap_privsize,
+	.estimate	= nft_bitmap_estimate,
+	.init		= nft_bitmap_init,
+	.destroy	= nft_bitmap_destroy,
+	.insert		= nft_bitmap_insert,
+	.remove		= nft_bitmap_remove,
+	.deactivate	= nft_bitmap_deactivate,
+	.flush		= nft_bitmap_flush,
+	.activate	= nft_bitmap_activate,
+	.lookup		= nft_bitmap_lookup,
+	.walk		= nft_bitmap_walk,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_bitmap_module_init(void)
+{
+	return nft_register_set(&nft_bitmap_ops);
+}
+
+static void __exit nft_bitmap_module_exit(void)
+{
+	nft_unregister_set(&nft_bitmap_ops);
+}
+
+module_init(nft_bitmap_module_init);
+module_exit(nft_bitmap_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_SET();
-- 
cgit v1.2.3


From ab23821f7ecfb022a4aec78fb6f4fd0f6aa1ccab Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 3 Feb 2017 13:35:48 +0100
Subject: netfilter: nft_ct: add zone id get support

Just like with counters the direction attribute is optional.
We set priv->dir to MAX unconditionally to avoid duplicating the assignment
for all keys with optional direction.

For keys where direction is mandatory, existing code already returns
an error.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_ct.c                   | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 53aac8b8ed6b..3e60ed78c538 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -870,6 +870,7 @@ enum nft_rt_attributes {
  * @NFT_CT_PKTS: conntrack packets
  * @NFT_CT_BYTES: conntrack bytes
  * @NFT_CT_AVGPKT: conntrack average bytes per packet
+ * @NFT_CT_ZONE: conntrack zone
  */
 enum nft_ct_keys {
 	NFT_CT_STATE,
@@ -889,6 +890,7 @@ enum nft_ct_keys {
 	NFT_CT_PKTS,
 	NFT_CT_BYTES,
 	NFT_CT_AVGPKT,
+	NFT_CT_ZONE,
 };
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 66a2377510e1..5bd4cdfdcda5 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -151,6 +151,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 	case NFT_CT_PROTOCOL:
 		*dest = nf_ct_protonum(ct);
 		return;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE: {
+		const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+
+		if (priv->dir < IP_CT_DIR_MAX)
+			*dest = nf_ct_zone_id(zone, priv->dir);
+		else
+			*dest = zone->id;
+
+		return;
+	}
+#endif
 	default:
 		break;
 	}
@@ -266,6 +278,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	int err;
 
 	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	priv->dir = IP_CT_DIR_MAX;
 	switch (priv->key) {
 	case NFT_CT_DIRECTION:
 		if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -333,11 +346,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
 	case NFT_CT_AVGPKT:
-		/* no direction? return sum of original + reply */
-		if (tb[NFTA_CT_DIRECTION] == NULL)
-			priv->dir = IP_CT_DIR_MAX;
 		len = sizeof(u64);
 		break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		len = sizeof(u16);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -465,6 +480,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
 	case NFT_CT_AVGPKT:
+	case NFT_CT_ZONE:
 		if (priv->dir < IP_CT_DIR_MAX &&
 		    nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
 			goto nla_put_failure;
-- 
cgit v1.2.3


From 5c178d81b69f08ca3195427a6ea9a46d9af23127 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 3 Feb 2017 13:35:49 +0100
Subject: netfilter: nft_ct: prepare for key-dependent error unwind

Next patch will add ZONE_ID set support which will need similar
error unwind (put operation) as conntrack labels.

Prepare for this: remove the 'label_got' boolean in favor
of a switch statement that can be extended in next patch.

As we already have that in the set_destroy function place that in
a separate function and call it from the set init function.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_ct.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 5bd4cdfdcda5..2d82df2737da 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -386,12 +386,24 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
+static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+	switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS:
+		nf_connlabels_put(ctx->net);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
 static int nft_ct_set_init(const struct nft_ctx *ctx,
 			   const struct nft_expr *expr,
 			   const struct nlattr * const tb[])
 {
 	struct nft_ct *priv = nft_expr_priv(expr);
-	bool label_got = false;
 	unsigned int len;
 	int err;
 
@@ -412,7 +424,6 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 		err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
 		if (err)
 			return err;
-		label_got = true;
 		break;
 #endif
 	default:
@@ -431,8 +442,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 	return 0;
 
 err1:
-	if (label_got)
-		nf_connlabels_put(ctx->net);
+	__nft_ct_set_destroy(ctx, priv);
 	return err;
 }
 
@@ -447,16 +457,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
 {
 	struct nft_ct *priv = nft_expr_priv(expr);
 
-	switch (priv->key) {
-#ifdef CONFIG_NF_CONNTRACK_LABELS
-	case NFT_CT_LABELS:
-		nf_connlabels_put(ctx->net);
-		break;
-#endif
-	default:
-		break;
-	}
-
+	__nft_ct_set_destroy(ctx, priv);
 	nft_ct_netns_put(ctx->net, ctx->afi->family);
 }
 
-- 
cgit v1.2.3


From edee4f1e92458299505ff007733f676b00c516a1 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 3 Feb 2017 13:35:50 +0100
Subject: netfilter: nft_ct: add zone id set support

zones allow tracking multiple connections sharing identical tuples,
this is needed e.g. when tracking distinct vlans with overlapping ip
addresses (conntrack is l2 agnostic).

Thus the zone has to be set before the packet is picked up by the
connection tracker.  This is done by means of 'conntrack templates' which
are conntrack structures used solely to pass this info from one netfilter
hook to the next.

The iptables CT target instantiates these connection tracking templates
once per rule, i.e. the template is fixed/tied to particular zone, can
be read-only and therefore be re-used by as many skbs simultaneously as
needed.

We can't follow this model because we want to take the zone id from
an sreg at rule eval time so we could e.g. fill in the zone id from
the packets vlan id or a e.g. nftables key : value maps.

To avoid cost of per packet alloc/free of the template, use a percpu
template 'scratch' object and use the refcount to detect the (unlikely)
case where the template is still attached to another skb (i.e., previous
skb was nfqueued ...).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_ct.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 143 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 2d82df2737da..c6b8022c0e47 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -32,6 +32,11 @@ struct nft_ct {
 	};
 };
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
+static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+#endif
+
 static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
 				   enum nft_ct_keys k,
 				   enum ip_conntrack_dir d)
@@ -191,6 +196,53 @@ err:
 	regs->verdict.code = NFT_BREAK;
 }
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_set_zone_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR };
+	const struct nft_ct *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	enum ip_conntrack_info ctinfo;
+	u16 value = regs->data[priv->sreg];
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) /* already tracked */
+		return;
+
+	zone.id = value;
+
+	switch (priv->dir) {
+	case IP_CT_DIR_ORIGINAL:
+		zone.dir = NF_CT_ZONE_DIR_ORIG;
+		break;
+	case IP_CT_DIR_REPLY:
+		zone.dir = NF_CT_ZONE_DIR_REPL;
+		break;
+	default:
+		break;
+	}
+
+	ct = this_cpu_read(nft_ct_pcpu_template);
+
+	if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+		nf_ct_zone_add(ct, &zone);
+	} else {
+		/* previous skb got queued to userspace */
+		ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
+		if (!ct) {
+			regs->verdict.code = NF_DROP;
+			return;
+		}
+	}
+
+	atomic_inc(&ct->ct_general.use);
+	nf_ct_set(skb, ct, IP_CT_NEW);
+}
+#endif
+
 static void nft_ct_set_eval(const struct nft_expr *expr,
 			    struct nft_regs *regs,
 			    const struct nft_pktinfo *pkt)
@@ -269,6 +321,45 @@ static void nft_ct_netns_put(struct net *net, uint8_t family)
 		nf_ct_netns_put(net, family);
 }
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_tmpl_put_pcpu(void)
+{
+	struct nf_conn *ct;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ct = per_cpu(nft_ct_pcpu_template, cpu);
+		if (!ct)
+			break;
+		nf_ct_put(ct);
+		per_cpu(nft_ct_pcpu_template, cpu) = NULL;
+	}
+}
+
+static bool nft_ct_tmpl_alloc_pcpu(void)
+{
+	struct nf_conntrack_zone zone = { .id = 0 };
+	struct nf_conn *tmp;
+	int cpu;
+
+	if (nft_ct_pcpu_template_refcnt)
+		return true;
+
+	for_each_possible_cpu(cpu) {
+		tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL);
+		if (!tmp) {
+			nft_ct_tmpl_put_pcpu();
+			return false;
+		}
+
+		atomic_set(&tmp->ct_general.use, 1);
+		per_cpu(nft_ct_pcpu_template, cpu) = tmp;
+	}
+
+	return true;
+}
+#endif
+
 static int nft_ct_get_init(const struct nft_ctx *ctx,
 			   const struct nft_expr *expr,
 			   const struct nlattr * const tb[])
@@ -393,6 +484,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
 	case NFT_CT_LABELS:
 		nf_connlabels_put(ctx->net);
 		break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		if (--nft_ct_pcpu_template_refcnt == 0)
+			nft_ct_tmpl_put_pcpu();
 #endif
 	default:
 		break;
@@ -407,6 +503,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 	unsigned int len;
 	int err;
 
+	priv->dir = IP_CT_DIR_MAX;
 	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
 	switch (priv->key) {
 #ifdef CONFIG_NF_CONNTRACK_MARK
@@ -425,11 +522,29 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 		if (err)
 			return err;
 		break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		if (!nft_ct_tmpl_alloc_pcpu())
+			return -ENOMEM;
+		nft_ct_pcpu_template_refcnt++;
+		break;
 #endif
 	default:
 		return -EOPNOTSUPP;
 	}
 
+	if (tb[NFTA_CT_DIRECTION]) {
+		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+		switch (priv->dir) {
+		case IP_CT_DIR_ORIGINAL:
+		case IP_CT_DIR_REPLY:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
 	priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
 	err = nft_validate_register_load(priv->sreg, len);
 	if (err < 0)
@@ -504,6 +619,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
 		goto nla_put_failure;
+
+	switch (priv->key) {
+	case NFT_CT_ZONE:
+		if (priv->dir < IP_CT_DIR_MAX &&
+		    nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+			goto nla_put_failure;
+		break;
+	default:
+		break;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -529,6 +655,17 @@ static const struct nft_expr_ops nft_ct_set_ops = {
 	.dump		= nft_ct_set_dump,
 };
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static const struct nft_expr_ops nft_ct_set_zone_ops = {
+	.type		= &nft_ct_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.eval		= nft_ct_set_zone_eval,
+	.init		= nft_ct_set_init,
+	.destroy	= nft_ct_set_destroy,
+	.dump		= nft_ct_set_dump,
+};
+#endif
+
 static const struct nft_expr_ops *
 nft_ct_select_ops(const struct nft_ctx *ctx,
 		    const struct nlattr * const tb[])
@@ -542,8 +679,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
 	if (tb[NFTA_CT_DREG])
 		return &nft_ct_get_ops;
 
-	if (tb[NFTA_CT_SREG])
+	if (tb[NFTA_CT_SREG]) {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+		if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE))
+			return &nft_ct_set_zone_ops;
+#endif
 		return &nft_ct_set_ops;
+	}
 
 	return ERR_PTR(-EINVAL);
 }
-- 
cgit v1.2.3


From 935b7f643018878bd9d4193eea8b575aff736b9b Mon Sep 17 00:00:00 2001
From: Manuel Messner
Date: Tue, 7 Feb 2017 03:14:53 +0100
Subject: netfilter: nft_exthdr: add TCP option matching

This patch implements the kernel side of the TCP option patch.

Signed-off-by: Manuel Messner <mm@skelett.io>
Reviewed-by: Florian Westphal <fw@strlen.de>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  17 ++++-
 net/netfilter/Kconfig                    |   4 +-
 net/netfilter/nft_exthdr.c               | 119 +++++++++++++++++++++++++++----
 3 files changed, 124 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3e60ed78c538..207951516ede 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -709,13 +709,27 @@ enum nft_exthdr_flags {
 };
 
 /**
- * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes
+ * enum nft_exthdr_op - nf_tables match options
+ *
+ * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers
+ * @NFT_EXTHDR_OP_TCP: match against tcp options
+ */
+enum nft_exthdr_op {
+	NFT_EXTHDR_OP_IPV6,
+	NFT_EXTHDR_OP_TCPOPT,
+	__NFT_EXTHDR_OP_MAX
+};
+#define NFT_EXTHDR_OP_MAX	(__NFT_EXTHDR_OP_MAX - 1)
+
+/**
+ * enum nft_exthdr_attributes - nf_tables extension header expression netlink attributes
  *
  * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers)
  * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8)
  * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32)
  * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
  * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32)
+ * @NFTA_EXTHDR_OP: option match type (NLA_U8)
  */
 enum nft_exthdr_attributes {
 	NFTA_EXTHDR_UNSPEC,
@@ -724,6 +738,7 @@ enum nft_exthdr_attributes {
 	NFTA_EXTHDR_OFFSET,
 	NFTA_EXTHDR_LEN,
 	NFTA_EXTHDR_FLAGS,
+	NFTA_EXTHDR_OP,
 	__NFTA_EXTHDR_MAX
 };
 #define NFTA_EXTHDR_MAX		(__NFTA_EXTHDR_MAX - 1)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ea479ed43373..9b28864cc36a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -467,10 +467,10 @@ config NF_TABLES_NETDEV
 	  This option enables support for the "netdev" table.
 
 config NFT_EXTHDR
-	tristate "Netfilter nf_tables IPv6 exthdr module"
+	tristate "Netfilter nf_tables exthdr module"
 	help
 	  This option adds the "exthdr" expression that you can use to match
-	  IPv6 extension headers.
+	  IPv6 extension headers and tcp options.
 
 config NFT_META
 	tristate "Netfilter nf_tables meta module"
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a89e5ab150db..c308920b194c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -15,20 +15,29 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
-// FIXME:
-#include <net/ipv6.h>
+#include <net/tcp.h>
 
 struct nft_exthdr {
 	u8			type;
 	u8			offset;
 	u8			len;
+	u8			op;
 	enum nft_registers	dreg:8;
 	u8			flags;
 };
 
-static void nft_exthdr_eval(const struct nft_expr *expr,
-			    struct nft_regs *regs,
-			    const struct nft_pktinfo *pkt)
+static unsigned int optlen(const u8 *opt, unsigned int offset)
+{
+	/* Beware zero-length options: make finite progress */
+	if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0)
+		return 1;
+	else
+		return opt[offset + 1];
+}
+
+static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
 	u32 *dest = &regs->data[priv->dreg];
@@ -52,6 +61,53 @@ err:
 	regs->verdict.code = NFT_BREAK;
 }
 
+static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	unsigned int i, optl, tcphdr_len, offset;
+	u32 *dest = &regs->data[priv->dreg];
+	struct tcphdr *tcph;
+	u8 *opt;
+
+	if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+		goto err;
+
+	tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff);
+	if (!tcph)
+		goto err;
+
+	tcphdr_len = __tcp_hdrlen(tcph);
+	if (tcphdr_len < sizeof(*tcph))
+		goto err;
+
+	tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff);
+	if (!tcph)
+		goto err;
+
+	opt = (u8 *)tcph;
+	for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+		optl = optlen(opt, i);
+
+		if (priv->type != opt[i])
+			continue;
+
+		if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
+			goto err;
+
+		offset = i + priv->offset;
+		dest[priv->len / NFT_REG32_SIZE] = 0;
+		memcpy(dest, opt + offset, priv->len);
+
+		return;
+	}
+
+err:
+	regs->verdict.code = NFT_BREAK;
+}
+
 static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
@@ -65,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
-	u32 offset, len, flags = 0;
+	u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
 	int err;
 
-	if (tb[NFTA_EXTHDR_DREG] == NULL ||
-	    tb[NFTA_EXTHDR_TYPE] == NULL ||
-	    tb[NFTA_EXTHDR_OFFSET] == NULL ||
-	    tb[NFTA_EXTHDR_LEN] == NULL)
+	if (!tb[NFTA_EXTHDR_DREG] ||
+	    !tb[NFTA_EXTHDR_TYPE] ||
+	    !tb[NFTA_EXTHDR_OFFSET] ||
+	    !tb[NFTA_EXTHDR_LEN])
 		return -EINVAL;
 
 	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
@@ -91,11 +147,18 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			return -EINVAL;
 	}
 
+	if (tb[NFTA_EXTHDR_OP]) {
+		err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
+		if (err < 0)
+			return err;
+	}
+
 	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
 	priv->offset = offset;
 	priv->len    = len;
 	priv->dreg   = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
 	priv->flags  = flags;
+	priv->op     = op;
 
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, priv->len);
@@ -115,6 +178,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -122,17 +187,45 @@ nla_put_failure:
 }
 
 static struct nft_expr_type nft_exthdr_type;
-static const struct nft_expr_ops nft_exthdr_ops = {
+static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
+	.type		= &nft_exthdr_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.eval		= nft_exthdr_ipv6_eval,
+	.init		= nft_exthdr_init,
+	.dump		= nft_exthdr_dump,
+};
+
+static const struct nft_expr_ops nft_exthdr_tcp_ops = {
 	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
-	.eval		= nft_exthdr_eval,
+	.eval		= nft_exthdr_tcp_eval,
 	.init		= nft_exthdr_init,
 	.dump		= nft_exthdr_dump,
 };
 
+static const struct nft_expr_ops *
+nft_exthdr_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	u32 op;
+
+	if (!tb[NFTA_EXTHDR_OP])
+		return &nft_exthdr_ipv6_ops;
+
+	op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
+	switch (op) {
+	case NFT_EXTHDR_OP_TCPOPT:
+		return &nft_exthdr_tcp_ops;
+	case NFT_EXTHDR_OP_IPV6:
+		return &nft_exthdr_ipv6_ops;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_exthdr_type __read_mostly = {
 	.name		= "exthdr",
-	.ops		= &nft_exthdr_ops,
+	.select_ops	= &nft_exthdr_select_ops,
 	.policy		= nft_exthdr_policy,
 	.maxattr	= NFTA_EXTHDR_MAX,
 	.owner		= THIS_MODULE,
-- 
cgit v1.2.3


From 97e219b7c1f75b14b29abe28ad53e8709e8d15e5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 7 Feb 2017 15:37:15 -0800
Subject: gro_cells: move to net/core/gro_cells.c

We have many gro cells users, so lets move the code to avoid
duplication.

This creates a CONFIG_GRO_CELLS option.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig     |  3 ++
 include/net/gro_cells.h | 86 +++------------------------------------------
 net/Kconfig             |  4 +++
 net/core/Makefile       |  1 +
 net/core/gro_cells.c    | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/Kconfig        |  1 +
 net/ipv6/Kconfig        |  1 +
 net/xfrm/Kconfig        |  1 +
 8 files changed, 107 insertions(+), 82 deletions(-)
 create mode 100644 net/core/gro_cells.c

(limited to 'net')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 95c32f2d7601..a993cbeb9e0c 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -170,6 +170,7 @@ config VXLAN
        tristate "Virtual eXtensible Local Area Network (VXLAN)"
        depends on INET
        select NET_UDP_TUNNEL
+       select GRO_CELLS
        ---help---
 	  This allows one to create vxlan virtual interfaces that provide
 	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
@@ -184,6 +185,7 @@ config GENEVE
        tristate "Generic Network Virtualization Encapsulation"
        depends on INET && NET_UDP_TUNNEL
        select NET_IP_TUNNEL
+       select GRO_CELLS
        ---help---
 	  This allows one to create geneve virtual interfaces that provide
 	  Layer 2 Networks over Layer 3 Networks. GENEVE is often used
@@ -216,6 +218,7 @@ config MACSEC
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_GCM
+	select GRO_CELLS
 	---help---
 	   MACsec is an encryption standard for Ethernet.
 
diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
index 2a1abbf8da74..fcaf8f479130 100644
--- a/include/net/gro_cells.h
+++ b/include/net/gro_cells.h
@@ -5,92 +5,14 @@
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 
-struct gro_cell {
-	struct sk_buff_head	napi_skbs;
-	struct napi_struct	napi;
-};
+struct gro_cell;
 
 struct gro_cells {
 	struct gro_cell __percpu	*cells;
 };
 
-static inline int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
-{
-	struct gro_cell *cell;
-	struct net_device *dev = skb->dev;
-
-	if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
-		return netif_rx(skb);
-
-	cell = this_cpu_ptr(gcells->cells);
-
-	if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
-		atomic_long_inc(&dev->rx_dropped);
-		kfree_skb(skb);
-		return NET_RX_DROP;
-	}
-
-	__skb_queue_tail(&cell->napi_skbs, skb);
-	if (skb_queue_len(&cell->napi_skbs) == 1)
-		napi_schedule(&cell->napi);
-	return NET_RX_SUCCESS;
-}
-
-/* called under BH context */
-static inline int gro_cell_poll(struct napi_struct *napi, int budget)
-{
-	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
-	struct sk_buff *skb;
-	int work_done = 0;
-
-	while (work_done < budget) {
-		skb = __skb_dequeue(&cell->napi_skbs);
-		if (!skb)
-			break;
-		napi_gro_receive(napi, skb);
-		work_done++;
-	}
-
-	if (work_done < budget)
-		napi_complete_done(napi, work_done);
-	return work_done;
-}
-
-static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
-{
-	int i;
-
-	gcells->cells = alloc_percpu(struct gro_cell);
-	if (!gcells->cells)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i) {
-		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
-
-		__skb_queue_head_init(&cell->napi_skbs);
-
-		set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
-
-		netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
-		napi_enable(&cell->napi);
-	}
-	return 0;
-}
-
-static inline void gro_cells_destroy(struct gro_cells *gcells)
-{
-	int i;
-
-	if (!gcells->cells)
-		return;
-	for_each_possible_cpu(i) {
-		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
-
-		netif_napi_del(&cell->napi);
-		__skb_queue_purge(&cell->napi_skbs);
-	}
-	free_percpu(gcells->cells);
-	gcells->cells = NULL;
-}
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb);
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev);
+void gro_cells_destroy(struct gro_cells *gcells);
 
 #endif
diff --git a/net/Kconfig b/net/Kconfig
index 2f2842d2d3ed..f19c0c3b9589 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -413,6 +413,10 @@ config DST_CACHE
 	bool
 	default n
 
+config GRO_CELLS
+	bool
+	default n
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/core/Makefile b/net/core/Makefile
index f6761b6e3b29..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -28,3 +28,4 @@ obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+
+struct gro_cell {
+	struct sk_buff_head	napi_skbs;
+	struct napi_struct	napi;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct gro_cell *cell;
+
+	if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
+		return netif_rx(skb);
+
+	cell = this_cpu_ptr(gcells->cells);
+
+	if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+		atomic_long_inc(&dev->rx_dropped);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	__skb_queue_tail(&cell->napi_skbs, skb);
+	if (skb_queue_len(&cell->napi_skbs) == 1)
+		napi_schedule(&cell->napi);
+	return NET_RX_SUCCESS;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while (work_done < budget) {
+		skb = __skb_dequeue(&cell->napi_skbs);
+		if (!skb)
+			break;
+		napi_gro_receive(napi, skb);
+		work_done++;
+	}
+
+	if (work_done < budget)
+		napi_complete_done(napi, work_done);
+	return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+	int i;
+
+	gcells->cells = alloc_percpu(struct gro_cell);
+	if (!gcells->cells)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+		__skb_queue_head_init(&cell->napi_skbs);
+
+		set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+		netif_napi_add(dev, &cell->napi, gro_cell_poll,
+			       NAPI_POLL_WEIGHT);
+		napi_enable(&cell->napi);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+	int i;
+
+	if (!gcells->cells)
+		return;
+	for_each_possible_cpu(i) {
+		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+		netif_napi_del(&cell->napi);
+		__skb_queue_purge(&cell->napi_skbs);
+	}
+	free_percpu(gcells->cells);
+	gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6e7baaf814c6..e30f9caddae8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
 config NET_IP_TUNNEL
 	tristate
 	select DST_CACHE
+	select GRO_CELLS
 	default n
 
 config NET_IPGRE
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec1267e2bd1f..3c7c76b2a7ba 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -208,6 +208,7 @@ config IPV6_TUNNEL
 	tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
 	select INET6_TUNNEL
 	select DST_CACHE
+	select GRO_CELLS
 	---help---
 	  Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
 	  RFC 2473.
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..c06d3997c6e7 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -4,6 +4,7 @@
 config XFRM
        bool
        depends on NET
+       select GRO_CELLS
 
 config XFRM_ALGO
 	tristate
-- 
cgit v1.2.3


From 8ef9594764617e3fd4500205b080b53c45c14c4b Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Tue, 7 Feb 2017 16:12:00 -0800
Subject: bridge: vlan tunnel id info range fill size calc cleanups

This fixes a bug and cleans up tunnelid range size
calculation code by using consistent variable names
and checks in size calculation and fill functions.

tested for a few cases of vlan-vni range mappings:
(output from patched iproute2):
$bridge vlan showtunnel
port     vid        tunid
vxlan0   100-105    1000-1005
         200        2000
         210        2100
         211-213    2100-2102
         214        2104
         216-217    2108-2109
         219        2119

Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support")
Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink_tunnel.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index f38473509647..c913491495ab 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -30,18 +30,18 @@ static size_t __get_vlan_tinfo_size(void)
 		  nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
 }
 
-static bool vlan_tunnel_id_isrange(struct net_bridge_vlan *v,
-				   struct net_bridge_vlan *v_end)
+static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
+			       struct net_bridge_vlan *v_last)
 {
-	__be32 tunid_curr = tunnel_id_to_key32(v->tinfo.tunnel_id);
-	__be32 tunid_end = tunnel_id_to_key32(v_end->tinfo.tunnel_id);
+	__be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
+	__be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
 
-	return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_end)) == 1;
+	return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
 }
 
 static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
 {
-	struct net_bridge_vlan *v, *v_start = NULL, *v_end = NULL;
+	struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
 	int num_tinfos = 0;
 
 	/* Count number of vlan infos */
@@ -50,27 +50,25 @@ static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
 		if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
 			continue;
 
-		if (!v_start) {
+		if (!vtbegin) {
 			goto initvars;
-		} else if ((v->vid - v_end->vid) == 1 &&
-			   vlan_tunnel_id_isrange(v_end, v) == 1) {
-			v_end = v;
+		} else if ((v->vid - vtend->vid) == 1 &&
+			   vlan_tunid_inrange(v, vtend)) {
+			vtend = v;
 			continue;
 		} else {
-			if ((v_end->vid - v->vid) > 0 &&
-			    vlan_tunnel_id_isrange(v_end, v) > 0)
+			if ((vtend->vid - vtbegin->vid) > 0)
 				num_tinfos += 2;
 			else
 				num_tinfos += 1;
 		}
 initvars:
-		v_start = v;
-		v_end = v;
+		vtbegin = v;
+		vtend = v;
 	}
 
-	if (v_start) {
-		if ((v_end->vid - v->vid) > 0 &&
-		    vlan_tunnel_id_isrange(v_end, v) > 0)
+	if (vtbegin && vtend) {
+		if ((vtend->vid - vtbegin->vid) > 0)
 			num_tinfos += 2;
 		else
 			num_tinfos += 1;
@@ -171,7 +169,7 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb,
 		if (!vtbegin) {
 			goto initvars;
 		} else if ((v->vid - vtend->vid) == 1 &&
-			    vlan_tunnel_id_isrange(v, vtend)) {
+			    vlan_tunid_inrange(v, vtend)) {
 			vtend = v;
 			continue;
 		} else {
-- 
cgit v1.2.3


From 982acb97560c8118c2109504a22b0d78a580547d Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Wed, 8 Feb 2017 11:16:39 +0100
Subject: ipv4: fib: Notify about nexthop status changes

When a multipath route is hit the kernel doesn't consider nexthops that
are DEAD or LINKDOWN when IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN is set.
Devices that offload multipath routes need to be made aware of nexthop
status changes. Otherwise, the device will keep forwarding packets to
non-functional nexthops.

Add the FIB_EVENT_NH_{ADD,DEL} events to the fib notification chain,
which notify capable devices when they should add or delete a nexthop
from their tables.

Cc: Roopa Prabhu <roopa@cumulusnetworks.com>
Cc: David Ahern <dsa@cumulusnetworks.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  7 +++++++
 net/ipv4/fib_semantics.c | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 57c2a863d0b2..45a184eaff2b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -214,11 +214,18 @@ struct fib_entry_notifier_info {
 	u32 nlflags;
 };
 
+struct fib_nh_notifier_info {
+	struct fib_notifier_info info; /* must be first */
+	struct fib_nh *fib_nh;
+};
+
 enum fib_event_type {
 	FIB_EVENT_ENTRY_ADD,
 	FIB_EVENT_ENTRY_DEL,
 	FIB_EVENT_RULE_ADD,
 	FIB_EVENT_RULE_DEL,
+	FIB_EVENT_NH_ADD,
+	FIB_EVENT_NH_DEL,
 };
 
 int register_fib_notifier(struct notifier_block *nb,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6306a67880e8..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1355,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
 	return ret;
 }
 
+static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+				 enum fib_event_type event_type)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+	struct fib_nh_notifier_info info = {
+		.fib_nh = fib_nh,
+	};
+
+	switch (event_type) {
+	case FIB_EVENT_NH_ADD:
+		if (fib_nh->nh_flags & RTNH_F_DEAD)
+			break;
+		if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		    fib_nh->nh_flags & RTNH_F_LINKDOWN)
+			break;
+		return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
+					  &info.info);
+	case FIB_EVENT_NH_DEL:
+		if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		     fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+		    (fib_nh->nh_flags & RTNH_F_DEAD))
+			return call_fib_notifiers(dev_net(fib_nh->nh_dev),
+						  event_type, &info.info);
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 /* Event              force Flags           Description
  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
@@ -1396,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
 					break;
 				}
+				call_fib_nh_notifiers(nexthop_nh,
+						      FIB_EVENT_NH_DEL);
 				dead++;
 			}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1550,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 				continue;
 			alive++;
 			nexthop_nh->nh_flags &= ~nh_flags;
+			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
 		} endfor_nexthops(fi)
 
 		if (alive > 0) {
-- 
cgit v1.2.3


From 960fdfdeb9e85a67bed136bc945c541ba61c2bdd Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 14:52:30 +0100
Subject: xfrm: input: constify xfrm_input_afinfo

Nothing writes to these structures (the module owner was not used).

While at it, size xfrm_input_afinfo[] by the highest existing xfrm family
(INET6), not AF_MAX.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h        |  5 ++---
 net/ipv4/xfrm4_protocol.c |  3 +--
 net/ipv6/xfrm6_protocol.c |  3 +--
 net/xfrm/xfrm_input.c     | 31 +++++++++++--------------------
 4 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d9a81dcef53e..da0e4dd653e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -349,13 +349,12 @@ struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family);
 
 struct xfrm_input_afinfo {
 	unsigned int		family;
-	struct module		*owner;
 	int			(*callback)(struct sk_buff *skb, u8 protocol,
 					    int err);
 };
 
-int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo);
-int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo);
+int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
+int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);
 
 void xfrm_state_delete_tunnel(struct xfrm_state *x);
 
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
 	.netns_ok	=	1,
 };
 
-static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
 	.family		=	AF_INET,
-	.owner		=	THIS_MODULE,
 	.callback	=	xfrm4_rcv_cb,
 };
 
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 54d13f8dbbae..b2dc8ce49378 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY,
 };
 
-static struct xfrm_input_afinfo xfrm6_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
 	.family		=	AF_INET6,
-	.owner		=	THIS_MODULE,
 	.callback	=	xfrm6_rcv_cb,
 };
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 3213fe8027be..8722294c6e59 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -19,19 +19,18 @@
 static struct kmem_cache *secpath_cachep __read_mostly;
 
 static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
-static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
+static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
 
 static struct gro_cells gro_cells;
 static struct net_device xfrm_napi_dev;
 
-int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
+int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
 
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+	if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo)))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_input_afinfo_lock);
 	if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
 		err = -EEXIST;
@@ -42,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_input_register_afinfo);
 
-int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
+int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
 
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
-		return -EAFNOSUPPORT;
 	spin_lock_bh(&xfrm_input_afinfo_lock);
 	if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
 		if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
@@ -63,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
 
-static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
+static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
 {
-	struct xfrm_input_afinfo *afinfo;
+	const struct xfrm_input_afinfo *afinfo;
 
-	if (unlikely(family >= NPROTO))
+	if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo)))
 		return NULL;
+
 	rcu_read_lock();
 	afinfo = rcu_dereference(xfrm_input_afinfo[family]);
 	if (unlikely(!afinfo))
@@ -76,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
 	return afinfo;
 }
 
-static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
 		       int err)
 {
 	int ret;
-	struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
+	const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
 
 	if (!afinfo)
 		return -EAFNOSUPPORT;
 
 	ret = afinfo->callback(skb, protocol, err);
-	xfrm_input_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
cgit v1.2.3


From f5e2bb4f5b2252e8f170f59bece2cf9a2efc8e6a Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:14 +0100
Subject: xfrm: policy: xfrm_get_tos cannot fail

The comment makes it look like get_tos() is used to validate something,
but it turns out the comment was about xfrm_find_bundle() which got removed
years ago.

xfrm_get_tos will return either the tos (ipv4) or 0 (ipv6).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 99ad1af2927f..6fffa2fac607 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1538,19 +1538,13 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
 
 }
 
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static inline int xfrm_get_tos(const struct flowi *fl, int family)
+static int xfrm_get_tos(const struct flowi *fl, int family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	int tos;
-
-	if (!afinfo)
-		return -EINVAL;
+	struct xfrm_policy_afinfo *afinfo;
+	int tos = 0;
 
-	tos = afinfo->get_tos(fl);
+	afinfo = xfrm_policy_get_afinfo(family);
+	tos = afinfo ? afinfo->get_tos(fl) : 0;
 
 	xfrm_policy_put_afinfo(afinfo);
 
@@ -1705,9 +1699,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
 
 	tos = xfrm_get_tos(fl, family);
-	err = tos;
-	if (tos < 0)
-		goto put_states;
 
 	dst_hold(dst);
 
-- 
cgit v1.2.3


From 2b61997aa0c68ae033d066ac2d9905ada81b761a Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:15 +0100
Subject: xfrm: policy: xfrm_policy_unregister_afinfo can return void

Nothing checks the return value. Also, the errors returned on unregister
are impossible (we only support INET and INET6, so no way
xfrm_policy_afinfo[afinfo->family] can be anything other than 'afinfo'
itself).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  2 +-
 net/xfrm/xfrm_policy.c | 35 +++++++++++++----------------------
 2 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index da0e4dd653e2..39037b1cce7a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -304,7 +304,7 @@ struct xfrm_policy_afinfo {
 };
 
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo);
-int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo);
+void xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo);
 void km_policy_notify(struct xfrm_policy *xp, int dir,
 		      const struct km_event *c);
 void km_state_notify(struct xfrm_state *x, const struct km_event *c);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6fffa2fac607..794148f76ae2 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2883,34 +2883,25 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_register_afinfo);
 
-int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+void xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
-	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
+	struct dst_ops *dst_ops = afinfo->dst_ops;
+
 	if (unlikely(afinfo->family >= NPROTO))
-		return -EAFNOSUPPORT;
-	spin_lock(&xfrm_policy_afinfo_lock);
-	if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
-		if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
-			err = -EINVAL;
-		else
-			RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
+		return;
+
+	if (likely(xfrm_policy_afinfo[afinfo->family] != afinfo)) {
+		RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
 					 NULL);
 	}
-	spin_unlock(&xfrm_policy_afinfo_lock);
-	if (!err) {
-		struct dst_ops *dst_ops = afinfo->dst_ops;
 
-		synchronize_rcu();
+	synchronize_rcu();
 
-		dst_ops->kmem_cachep = NULL;
-		dst_ops->check = NULL;
-		dst_ops->negative_advice = NULL;
-		dst_ops->link_failure = NULL;
-		afinfo->garbage_collect = NULL;
-	}
-	return err;
+	dst_ops->kmem_cachep = NULL;
+	dst_ops->check = NULL;
+	dst_ops->negative_advice = NULL;
+	dst_ops->link_failure = NULL;
+	afinfo->garbage_collect = NULL;
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
-- 
cgit v1.2.3


From 3d7d25a68ea5153d9d0d01c8c83acf644eab9704 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:16 +0100
Subject: xfrm: policy: remove garbage_collect callback

Just call xfrm_garbage_collect_deferred() directly.
This gets rid of a write to afinfo in register/unregister and allows to
constify afinfo later on.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      | 2 +-
 net/ipv4/xfrm4_policy.c | 2 +-
 net/ipv6/xfrm6_policy.c | 2 +-
 net/xfrm/xfrm_policy.c  | 6 ++----
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 39037b1cce7a..a22368edd105 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -282,7 +282,6 @@ struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
 	struct dst_ops		*dst_ops;
-	void			(*garbage_collect)(struct net *net);
 	struct dst_entry	*(*dst_lookup)(struct net *net,
 					       int tos, int oif,
 					       const xfrm_address_t *saddr,
@@ -1169,6 +1168,7 @@ static inline void xfrm_sk_free_policy(struct sock *sk)
 }
 
 void xfrm_garbage_collect(struct net *net);
+void xfrm_garbage_collect_deferred(struct net *net);
 
 #else
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..77ca91d7b79c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -219,7 +219,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 {
 	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
 
-	xfrm4_policy_afinfo.garbage_collect(net);
+	xfrm_garbage_collect_deferred(net);
 	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e0f71c01d728..c273f22b12af 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -220,7 +220,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 {
 	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
 
-	xfrm6_policy_afinfo.garbage_collect(net);
+	xfrm_garbage_collect_deferred(net);
 	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 794148f76ae2..80695a157b6e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2733,10 +2733,11 @@ void xfrm_garbage_collect(struct net *net)
 }
 EXPORT_SYMBOL(xfrm_garbage_collect);
 
-static void xfrm_garbage_collect_deferred(struct net *net)
+void xfrm_garbage_collect_deferred(struct net *net)
 {
 	flow_cache_flush_deferred(net);
 }
+EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
 
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
@@ -2873,8 +2874,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 			dst_ops->link_failure = xfrm_link_failure;
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
-		if (likely(afinfo->garbage_collect == NULL))
-			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
 	}
 	spin_unlock(&xfrm_policy_afinfo_lock);
@@ -2901,7 +2900,6 @@ void xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
 	dst_ops->check = NULL;
 	dst_ops->negative_advice = NULL;
 	dst_ops->link_failure = NULL;
-	afinfo->garbage_collect = NULL;
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
-- 
cgit v1.2.3


From a2817d8b279bc8fe62da76e6019eb9ff9d4e319c Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:17 +0100
Subject: xfrm: policy: remove family field

Only needed it to register the policy backend at init time.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  5 ++---
 net/ipv4/xfrm4_policy.c |  5 +----
 net/ipv6/xfrm6_policy.c |  5 +----
 net/xfrm/xfrm_policy.c  | 34 +++++++++++++++++-----------------
 4 files changed, 21 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a22368edd105..6e061309adca 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -280,7 +280,6 @@ struct net_device;
 struct xfrm_type;
 struct xfrm_dst;
 struct xfrm_policy_afinfo {
-	unsigned short		family;
 	struct dst_ops		*dst_ops;
 	struct dst_entry	*(*dst_lookup)(struct net *net,
 					       int tos, int oif,
@@ -302,8 +301,8 @@ struct xfrm_policy_afinfo {
 	struct dst_entry	*(*blackhole_route)(struct net *net, struct dst_entry *orig);
 };
 
-int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo);
-void xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo);
+int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family);
+void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
 void km_policy_notify(struct xfrm_policy *xp, int dir,
 		      const struct km_event *c);
 void km_state_notify(struct xfrm_state *x, const struct km_event *c);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 77ca91d7b79c..25613539766f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 #include <net/ip.h>
 #include <net/l3mdev.h>
 
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-
 static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 					    int tos, int oif,
 					    const xfrm_address_t *saddr,
@@ -272,7 +270,6 @@ static struct dst_ops xfrm4_dst_ops_template = {
 };
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
-	.family = 		AF_INET,
 	.dst_ops =		&xfrm4_dst_ops_template,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
 
 static void __init xfrm4_policy_init(void)
 {
-	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
 }
 
 void __init xfrm4_init(void)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index c273f22b12af..3217e85334e3 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -25,8 +25,6 @@
 #include <net/mip6.h>
 #endif
 
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
-
 static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
 					  const xfrm_address_t *daddr)
@@ -292,7 +290,6 @@ static struct dst_ops xfrm6_dst_ops_template = {
 };
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
-	.family =		AF_INET6,
 	.dst_ops =		&xfrm6_dst_ops_template,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr =		xfrm6_get_saddr,
@@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 
 static int __init xfrm6_policy_init(void)
 {
-	return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+	return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6);
 }
 
 static void xfrm6_policy_fini(void)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 80695a157b6e..7698785876d7 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
 };
 
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
-static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
+static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl
 	return false;
 }
 
-static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo;
+	const struct xfrm_policy_afinfo *afinfo;
 
-	if (unlikely(family >= NPROTO))
+	if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
 		return NULL;
 	rcu_read_lock();
 	afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
@@ -2848,15 +2848,15 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 	return dst->path->ops->neigh_lookup(dst, skb, daddr);
 }
 
-int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
+int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
 {
 	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+
+	if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
 		return -EAFNOSUPPORT;
+
 	spin_lock(&xfrm_policy_afinfo_lock);
-	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
+	if (unlikely(xfrm_policy_afinfo[family] != NULL))
 		err = -EEXIST;
 	else {
 		struct dst_ops *dst_ops = afinfo->dst_ops;
@@ -2874,7 +2874,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 			dst_ops->link_failure = xfrm_link_failure;
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
-		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
+		rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
 	}
 	spin_unlock(&xfrm_policy_afinfo_lock);
 
@@ -2882,16 +2882,16 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_register_afinfo);
 
-void xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
 {
 	struct dst_ops *dst_ops = afinfo->dst_ops;
+	int i;
 
-	if (unlikely(afinfo->family >= NPROTO))
-		return;
-
-	if (likely(xfrm_policy_afinfo[afinfo->family] != afinfo)) {
-		RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
-					 NULL);
+	for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
+		if (xfrm_policy_afinfo[i] != afinfo)
+			continue;
+		RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
+		break;
 	}
 
 	synchronize_rcu();
-- 
cgit v1.2.3


From bdba9fe01e1bcb942c95a1a332b27ef829c87df4 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:18 +0100
Subject: xfrm: policy: remove xfrm_policy_put_afinfo

Alternative is to keep it an make the (unused) afinfo arg const to avoid
the compiler warnings once the afinfo structs get constified.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7698785876d7..80ef0dcf0e29 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -116,11 +116,6 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
 	return afinfo;
 }
 
-static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 						  int tos, int oif,
 						  const xfrm_address_t *saddr,
@@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 
 	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return dst;
 }
@@ -1436,7 +1431,7 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
 	if (unlikely(afinfo == NULL))
 		return -EINVAL;
 	err = afinfo->get_saddr(net, oif, local, remote);
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -1546,7 +1541,7 @@ static int xfrm_get_tos(const struct flowi *fl, int family)
 	afinfo = xfrm_policy_get_afinfo(family);
 	tos = afinfo ? afinfo->get_tos(fl) : 0;
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return tos;
 }
@@ -1632,7 +1627,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	} else
 		xdst = ERR_PTR(-ENOBUFS);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return xdst;
 }
@@ -1649,7 +1644,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 
 	err = afinfo->init_path(path, dst, nfheader_len);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return err;
 }
@@ -1666,7 +1661,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	err = afinfo->fill_dst(xdst, dev, fl);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return err;
 }
@@ -2215,7 +2210,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
 	} else {
 		ret = afinfo->blackhole_route(net, dst_orig);
 	}
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -2465,7 +2460,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 
 	afinfo->decode_session(skb, fl, reverse);
 	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(__xfrm_decode_session);
-- 
cgit v1.2.3


From 37b103830ec3e52a761bb647eb78da22a1fe4e09 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 7 Feb 2017 15:00:19 +0100
Subject: xfrm: policy: make policy backend const

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c |  2 +-
 net/ipv6/xfrm6_policy.c |  2 +-
 net/xfrm/xfrm_policy.c  | 18 +++++++++---------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 25613539766f..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -269,7 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
 	.gc_thresh =		INT_MAX,
 };
 
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops_template,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 3217e85334e3..79651bc71bf0 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -289,7 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
 	.gc_thresh =		INT_MAX,
 };
 
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
+static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops_template,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr =		xfrm6_get_saddr,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 80ef0dcf0e29..04ed1a1ae019 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
 };
 
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
-static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
+static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -122,7 +122,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 						  const xfrm_address_t *daddr,
 						  int family)
 {
-	struct xfrm_policy_afinfo *afinfo;
+	const struct xfrm_policy_afinfo *afinfo;
 	struct dst_entry *dst;
 
 	afinfo = xfrm_policy_get_afinfo(family);
@@ -1426,7 +1426,7 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
 	       xfrm_address_t *remote, unsigned short family)
 {
 	int err;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
 	if (unlikely(afinfo == NULL))
 		return -EINVAL;
@@ -1535,7 +1535,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
 
 static int xfrm_get_tos(const struct flowi *fl, int family)
 {
-	struct xfrm_policy_afinfo *afinfo;
+	const struct xfrm_policy_afinfo *afinfo;
 	int tos = 0;
 
 	afinfo = xfrm_policy_get_afinfo(family);
@@ -1598,7 +1598,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = {
 
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	struct dst_ops *dst_ops;
 	struct xfrm_dst *xdst;
 
@@ -1635,7 +1635,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 				 int nfheader_len)
 {
-	struct xfrm_policy_afinfo *afinfo =
+	const struct xfrm_policy_afinfo *afinfo =
 		xfrm_policy_get_afinfo(dst->ops->family);
 	int err;
 
@@ -1652,7 +1652,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 				const struct flowi *fl)
 {
-	struct xfrm_policy_afinfo *afinfo =
+	const struct xfrm_policy_afinfo *afinfo =
 		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
 	int err;
 
@@ -2201,7 +2201,7 @@ error:
 static struct dst_entry *make_blackhole(struct net *net, u16 family,
 					struct dst_entry *dst_orig)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	struct dst_entry *ret;
 
 	if (!afinfo) {
@@ -2452,7 +2452,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
 int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 			  unsigned int family, int reverse)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	int err;
 
 	if (unlikely(afinfo == NULL))
-- 
cgit v1.2.3


From 8585989d146c61dd073d2135c5bb11d0f979d576 Mon Sep 17 00:00:00 2001
From: Luca Coelho
Date: Wed, 8 Feb 2017 15:00:34 +0200
Subject: cfg80211: fix NAN bands definition

The nl80211_nan_dual_band_conf enumeration doesn't make much sense.
The default value is assigned to a bit, which makes it weird if the
default bit and other bits are set at the same time.

To improve this, get rid of NL80211_NAN_BAND_DEFAULT and add a wiphy
configuration to let the drivers define which bands are supported.
This is exposed to the userspace, which then can make a decision on
which band(s) to use.  Additionally, rename all "dual_band" elements
to "bands", to make things clearer.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 18 ++++++++++----
 include/uapi/linux/nl80211.h | 57 ++++++++++++++++++++------------------------
 net/mac80211/cfg.c           |  4 ++--
 net/mac80211/trace.h         | 16 ++++++-------
 net/wireless/core.c          |  3 ++-
 net/wireless/nl80211.c       | 35 ++++++++++++++++++++-------
 net/wireless/trace.h         | 16 ++++++-------
 7 files changed, 86 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a2c18b53e053..c92dc03c8528 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5,7 +5,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2015-2016	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2416,11 +2416,13 @@ struct cfg80211_qos_map {
  * This struct defines NAN configuration parameters
  *
  * @master_pref: master preference (1 - 255)
- * @dual: dual band operation mode, see &enum nl80211_nan_dual_band_conf
+ * @bands: operating bands, a bitmap of &enum nl80211_band values.
+ *	For instance, for NL80211_BAND_2GHZ, bit 0 would be set
+ *	(i.e. BIT(NL80211_BAND_2GHZ)).
  */
 struct cfg80211_nan_conf {
 	u8 master_pref;
-	u8 dual;
+	u8 bands;
 };
 
 /**
@@ -2428,11 +2430,11 @@ struct cfg80211_nan_conf {
  * configuration
  *
  * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
- * @CFG80211_NAN_CONF_CHANGED_DUAL: dual band operation
+ * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
  */
 enum cfg80211_nan_conf_changes {
 	CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
-	CFG80211_NAN_CONF_CHANGED_DUAL = BIT(1),
+	CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1),
 };
 
 /**
@@ -3596,6 +3598,10 @@ struct wiphy_iftype_ext_capab {
  *	attribute indices defined in &enum nl80211_bss_select_attr.
  *
  * @cookie_counter: unique generic cookie counter, used to identify objects.
+ * @nan_supported_bands: bands supported by the device in NAN mode, a
+ *	bitmap of &enum nl80211_band values.  For instance, for
+ *	NL80211_BAND_2GHZ, bit 0 would be set
+ *	(i.e. BIT(NL80211_BAND_2GHZ)).
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
@@ -3727,6 +3733,8 @@ struct wiphy {
 
 	u64 cookie_counter;
 
+	u8 nan_supported_bands;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index cd547b864595..5ed257c4cd4e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -10,7 +10,7 @@
  * Copyright 2008, 2009 Luis R. Rodriguez <lrodriguez@atheros.com>
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -854,12 +854,15 @@
  *	cfg80211_scan_done().
  *
  * @NL80211_CMD_START_NAN: Start NAN operation, identified by its
- *	%NL80211_ATTR_WDEV interface. This interface must have been previously
- *	created with %NL80211_CMD_NEW_INTERFACE. After it has been started, the
- *	NAN interface will create or join a cluster. This command must have a
- *	valid %NL80211_ATTR_NAN_MASTER_PREF attribute and optional
- *	%NL80211_ATTR_NAN_DUAL attributes.
- *	After this command NAN functions can be added.
+ *	%NL80211_ATTR_WDEV interface. This interface must have been
+ *	previously created with %NL80211_CMD_NEW_INTERFACE. After it
+ *	has been started, the NAN interface will create or join a
+ *	cluster. This command must have a valid
+ *	%NL80211_ATTR_NAN_MASTER_PREF attribute and optional
+ *	%NL80211_ATTR_BANDS attributes.  If %NL80211_ATTR_BANDS is
+ *	omitted or set to 0, it means don't-care and the device will
+ *	decide what to use.  After this command NAN functions can be
+ *	added.
  * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by
  *	its %NL80211_ATTR_WDEV interface.
  * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined
@@ -880,10 +883,14 @@
  *	This command is also used as a notification sent when a NAN function is
  *	terminated. This will contain a %NL80211_ATTR_NAN_FUNC_INST_ID
  *	and %NL80211_ATTR_COOKIE attributes.
- * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN
- *	must be operational (%NL80211_CMD_START_NAN was executed).
- *	It must contain at least one of the following attributes:
- *	%NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL.
+ * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN
+ *	configuration. NAN must be operational (%NL80211_CMD_START_NAN
+ *	was executed).  It must contain at least one of the following
+ *	attributes: %NL80211_ATTR_NAN_MASTER_PREF,
+ *	%NL80211_ATTR_BANDS.  If %NL80211_ATTR_BANDS is omitted, the
+ *	current configuration is not changed.  If it is present but
+ *	set to zero, the configuration is changed to don't-care
+ *	(i.e. the device can decide what to do).
  * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported.
  *	This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and
  *	%NL80211_ATTR_COOKIE.
@@ -1963,10 +1970,13 @@ enum nl80211_commands {
  *	%NL80211_CMD_CHANGE_NAN_CONFIG. Its type is u8 and it can't be 0.
  *	Also, values 1 and 255 are reserved for certification purposes and
  *	should not be used during a normal device operation.
- * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see
- *	&enum nl80211_nan_dual_band_conf). This attribute is used with
- *	%NL80211_CMD_START_NAN and optionally with
- *	%NL80211_CMD_CHANGE_NAN_CONFIG.
+ * @NL80211_ATTR_BANDS: operating bands configuration.  This is a u32
+ *	bitmask of BIT(NL80211_BAND_*) as described in %enum
+ *	nl80211_band.  For instance, for NL80211_BAND_2GHZ, bit 0
+ *	would be set.  This attribute is used with
+ *	%NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG, and
+ *	it is optional.  If no bands are set, it means don't-care and
+ *	the device will decide what to use.
  * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See
  *	&enum nl80211_nan_func_attributes for description of this nested
  *	attribute.
@@ -2397,7 +2407,7 @@ enum nl80211_attrs {
 	NL80211_ATTR_MESH_PEER_AID,
 
 	NL80211_ATTR_NAN_MASTER_PREF,
-	NL80211_ATTR_NAN_DUAL,
+	NL80211_ATTR_BANDS,
 	NL80211_ATTR_NAN_FUNC,
 	NL80211_ATTR_NAN_MATCH,
 
@@ -5070,21 +5080,6 @@ enum nl80211_bss_select_attr {
 	NL80211_BSS_SELECT_ATTR_MAX = __NL80211_BSS_SELECT_ATTR_AFTER_LAST - 1
 };
 
-/**
- * enum nl80211_nan_dual_band_conf - NAN dual band configuration
- *
- * Defines the NAN dual band mode of operation
- *
- * @NL80211_NAN_BAND_DEFAULT: device default mode
- * @NL80211_NAN_BAND_2GHZ: 2.4GHz mode
- * @NL80211_NAN_BAND_5GHZ: 5GHz mode
-  */
-enum nl80211_nan_dual_band_conf {
-	NL80211_NAN_BAND_DEFAULT	= 1 << 0,
-	NL80211_NAN_BAND_2GHZ		= 1 << 1,
-	NL80211_NAN_BAND_5GHZ		= 1 << 2,
-};
-
 /**
  * enum nl80211_nan_function_type - NAN function type
  *
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index a0be2f6cd121..ac879bb17870 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
 	if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
 		new_conf.master_pref = conf->master_pref;
 
-	if (changes & CFG80211_NAN_CONF_CHANGED_DUAL)
-		new_conf.dual = conf->dual;
+	if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
+		new_conf.bands = conf->bands;
 
 	ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
 	if (!ret)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index f78d9f4f8711..0d645bc148d0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan,
 		LOCAL_ENTRY
 		VIF_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual)
+		__field(u8, bands)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 	),
 
 	TP_printk(
 		LOCAL_PR_FMT  VIF_PR_FMT
-		", master preference: %u, dual: %d",
+		", master preference: %u, bands: 0x%0x",
 		LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
-		__entry->dual
+		__entry->bands
 	)
 );
 
@@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf,
 		LOCAL_ENTRY
 		VIF_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual)
+		__field(u8, bands)
 		__field(u32, changes)
 	),
 
@@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 		__entry->changes = changes;
 	),
 
 	TP_printk(
 		LOCAL_PR_FMT  VIF_PR_FMT
-		", master preference: %u, dual: %d, changes: 0x%x",
+		", master preference: %u, bands: 0x%0x, changes: 0x%x",
 		LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
-		__entry->dual, __entry->changes
+		__entry->bands, __entry->changes
 	)
 );
 
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 903fc419217a..e55e05bc4805 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -626,7 +626,8 @@ int wiphy_register(struct wiphy *wiphy)
 
 	if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
 		    (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
-		     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
+		     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
+		     !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
 		return -EINVAL;
 
 #ifndef CONFIG_WIRELESS_WDS
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9d738f75bd4e..b5f755b3ac5d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -398,7 +398,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	},
 	[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
 	[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
-	[NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
+	[NL80211_ATTR_BANDS] = { .type = NLA_U32 },
 	[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
 	[NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
 				    .len = FILS_MAX_KEK_LEN },
@@ -1886,6 +1886,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 			}
 		}
 
+		if (nla_put_u32(msg, NL80211_ATTR_BANDS,
+				rdev->wiphy.nan_supported_bands))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -10777,15 +10781,22 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
 		return -EINVAL;
 
-	if (!info->attrs[NL80211_ATTR_NAN_DUAL])
-		return -EINVAL;
-
 	conf.master_pref =
 		nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
 	if (!conf.master_pref)
 		return -EINVAL;
 
-	conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+	if (info->attrs[NL80211_ATTR_BANDS]) {
+		u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+		if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+			return -EOPNOTSUPP;
+
+		if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+			return -EINVAL;
+
+		conf.bands = bands;
+	}
 
 	err = rdev_start_nan(rdev, wdev, &conf);
 	if (err)
@@ -11150,9 +11161,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
 		changed |= CFG80211_NAN_CONF_CHANGED_PREF;
 	}
 
-	if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
-		conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
-		changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
+	if (info->attrs[NL80211_ATTR_BANDS]) {
+		u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+		if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+			return -EOPNOTSUPP;
+
+		if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+			return -EINVAL;
+
+		conf.bands = bands;
+		changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
 	}
 
 	if (!changed)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2419c390f150..776e80cef9b4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1915,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan,
 		WIPHY_ENTRY
 		WDEV_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual);
+		__field(u8, bands);
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		WDEV_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 	),
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
-		  ", master preference: %u, dual: %d",
+		  ", master preference: %u, bands: 0x%0x",
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
-		  __entry->dual)
+		  __entry->bands)
 );
 
 TRACE_EVENT(rdev_nan_change_conf,
@@ -1937,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf,
 		WIPHY_ENTRY
 		WDEV_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual);
+		__field(u8, bands);
 		__field(u32, changes);
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		WDEV_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 		__entry->changes = changes;
 	),
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
-		  ", master preference: %u, dual: %d, changes: %x",
+		  ", master preference: %u, bands: 0x%0x, changes: %x",
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
-		  __entry->dual, __entry->changes)
+		  __entry->bands, __entry->changes)
 );
 
 DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
-- 
cgit v1.2.3


From f181d6a3bcc35633facf5f3925699021c13492c5 Mon Sep 17 00:00:00 2001
From: Koen Vandeputte
Date: Wed, 8 Feb 2017 15:32:05 +0100
Subject: mac80211: fix CSA in IBSS mode

Add the missing IBSS capability flag during capability init as it needs
to be inserted into the generated beacon in order for CSA to work.

Fixes: cd7760e62c2ac ("mac80211: add support for CSA in IBSS mode")
Signed-off-by: Piotr Gawlowicz <gawlowicz@tkn.tu-berlin.de>
Signed-off-by: Mikołaj Chwalisz <chwalisz@tkn.tu-berlin.de>
Tested-by: Koen Vandeputte <koen.vandeputte@ncentric.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a31d30713d08..98999d3d5262 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	struct beacon_data *presp, *old_presp;
 	struct cfg80211_bss *cbss;
 	const struct cfg80211_bss_ies *ies;
-	u16 capability = 0;
+	u16 capability = WLAN_CAPABILITY_IBSS;
 	u64 tsf;
 	int ret = 0;
 
 	sdata_assert_lock(sdata);
 
 	if (ifibss->privacy)
-		capability = WLAN_CAPABILITY_PRIVACY;
+		capability |= WLAN_CAPABILITY_PRIVACY;
 
 	cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
 				ifibss->bssid, ifibss->ssid,
-- 
cgit v1.2.3


From 119aecbae57e6f10b85b7cd5e070eef006dbcd74 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 9 Feb 2017 01:18:16 +0800
Subject: sctp: streams should be recovered when it fails to send request.

Now when sending stream reset request, it closes the streams to
block further xmit of data until this request is completed, then
calls sctp_send_reconf to send the chunk.

But if sctp_send_reconf returns err, and it doesn't recover the
streams' states back,  which means the request chunk would not be
queued and sent, so the asoc will get stuck, streams are closed
and no packet is even queued.

This patch is to fix it by recovering the streams' states when
it fails to send the request, it is also to fix a return value.

Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 13d5e07dcd7d..6a686e330c57 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -136,8 +136,10 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 				goto out;
 
 	chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
-	if (!chunk)
+	if (!chunk) {
+		retval = -ENOMEM;
 		goto out;
+	}
 
 	if (out) {
 		if (str_nums)
@@ -149,7 +151,6 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 				stream->out[i].state = SCTP_STREAM_CLOSED;
 	}
 
-	asoc->strreset_outstanding = out + in;
 	asoc->strreset_chunk = chunk;
 	sctp_chunk_hold(asoc->strreset_chunk);
 
@@ -157,8 +158,22 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 	if (retval) {
 		sctp_chunk_put(asoc->strreset_chunk);
 		asoc->strreset_chunk = NULL;
+		if (!out)
+			goto out;
+
+		if (str_nums)
+			for (i = 0; i < str_nums; i++)
+				stream->out[str_list[i]].state =
+						       SCTP_STREAM_OPEN;
+		else
+			for (i = 0; i < stream->outcnt; i++)
+				stream->out[i].state = SCTP_STREAM_OPEN;
+
+		goto out;
 	}
 
+	asoc->strreset_outstanding = out + in;
+
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From c56480a1e90261842f54f3a5a9ebc12d827f0c3e Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 9 Feb 2017 01:18:17 +0800
Subject: sctp: add support for generating stream reconf ssn/tsn reset request
 chunk

This patch is to define SSN/TSN Reset Request Parameter described
in rfc6525 section 4.3.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  5 +++++
 include/net/sctp/sm.h    |  2 ++
 net/sctp/sm_make_chunk.c | 29 +++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index d74fca3f3141..71c0d41d9a59 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -737,4 +737,9 @@ struct sctp_strreset_inreq {
 	__u16 list_of_streams[0];
 };
 
+struct sctp_strreset_tsnreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 430ed139fbbb..ac37c1782e23 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -265,6 +265,8 @@ struct sctp_chunk *sctp_make_strreset_req(
 				const struct sctp_association *asoc,
 				__u16 stream_num, __u16 *stream_list,
 				bool out, bool in);
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+				const struct sctp_association *asoc);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c7d3249f88ec..749842aead33 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3658,3 +3658,32 @@ struct sctp_chunk *sctp_make_strreset_req(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 15       |      Parameter Length = 8     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Request Sequence Number              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+				const struct sctp_association *asoc)
+{
+	struct sctp_strreset_tsnreq tsnreq;
+	__u16 length = sizeof(tsnreq);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
+	tsnreq.param_hdr.length = htons(length);
+	tsnreq.request_seq = htonl(asoc->strreset_outseq);
+
+	sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
+
+	return retval;
+}
-- 
cgit v1.2.3


From a92ce1a42dde1caaee4afae67531e3e7acecf6e4 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 9 Feb 2017 01:18:18 +0800
Subject: sctp: implement sender-side procedures for SSN/TSN Reset Request
 Parameter

This patch is to implement Sender-Side Procedures for the SSN/TSN
Reset Request Parameter descibed in rfc6525 section 5.1.4.

It is also to add sockopt SCTP_RESET_ASSOC in rfc6525 section 6.3.3
for users.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h   |  1 +
 include/uapi/linux/sctp.h |  1 +
 net/sctp/socket.c         | 29 +++++++++++++++++++++++++++++
 net/sctp/stream.c         | 40 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 480b65a24aff..b60ca14068d8 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -198,6 +198,7 @@ int sctp_offload_init(void);
  */
 int sctp_send_reset_streams(struct sctp_association *asoc,
 			    struct sctp_reset_streams *params);
+int sctp_send_reset_assoc(struct sctp_association *asoc);
 
 /*
  * Module global variables
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 03c27cefffb1..c0bd8c3d565a 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -117,6 +117,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PR_ASSOC_STATUS	115
 #define SCTP_ENABLE_STREAM_RESET	118
 #define SCTP_RESET_STREAMS	119
+#define SCTP_RESET_ASSOC	120
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a8b4252fe084..45a7c417eb7f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3818,6 +3818,32 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_reset_assoc(struct sock *sk,
+				       char __user *optval,
+				       unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	sctp_assoc_t associd;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(associd))
+		goto out;
+
+	if (copy_from_user(&associd, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, associd);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_reset_assoc(asoc);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3990,6 +4016,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_RESET_STREAMS:
 		retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
 		break;
+	case SCTP_RESET_ASSOC:
+		retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 6a686e330c57..53e49fc2f0a3 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -177,3 +177,43 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 out:
 	return retval;
 }
+
+int sctp_send_reset_assoc(struct sctp_association *asoc)
+{
+	struct sctp_chunk *chunk = NULL;
+	int retval;
+	__u16 i;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
+		return -ENOPROTOOPT;
+
+	if (asoc->strreset_outstanding)
+		return -EINPROGRESS;
+
+	chunk = sctp_make_strreset_tsnreq(asoc);
+	if (!chunk)
+		return -ENOMEM;
+
+	/* Block further xmit of data until this request is completed */
+	for (i = 0; i < asoc->stream->outcnt; i++)
+		asoc->stream->out[i].state = SCTP_STREAM_CLOSED;
+
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+
+		for (i = 0; i < asoc->stream->outcnt; i++)
+			asoc->stream->out[i].state = SCTP_STREAM_OPEN;
+
+		return retval;
+	}
+
+	asoc->strreset_outstanding = 1;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 78098117f8bfad4f2104c3f7b6b69071af95a246 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 9 Feb 2017 01:18:19 +0800
Subject: sctp: add support for generating stream reconf add incoming/outgoing
 streams request chunk

This patch is to define Add Incoming/Outgoing Streams Request
Parameter described in rfc6525 section 4.5 and 4.6. They can
be in one same chunk trunk as rfc6525 section 3.1-7 describes,
so make them in one function.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  7 +++++++
 include/net/sctp/sm.h    |  3 +++
 net/sctp/sm_make_chunk.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 71c0d41d9a59..b055788de0cf 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -742,4 +742,11 @@ struct sctp_strreset_tsnreq {
 	__u32 request_seq;
 };
 
+struct sctp_strreset_addstrm {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u16 number_of_streams;
+	__u16 reserved;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ac37c1782e23..3675fde3a26e 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -267,6 +267,9 @@ struct sctp_chunk *sctp_make_strreset_req(
 				bool out, bool in);
 struct sctp_chunk *sctp_make_strreset_tsnreq(
 				const struct sctp_association *asoc);
+struct sctp_chunk *sctp_make_strreset_addstrm(
+				const struct sctp_association *asoc,
+				__u16 out, __u16 in);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 749842aead33..7f8dbf2c6cee 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3687,3 +3687,49 @@ struct sctp_chunk *sctp_make_strreset_tsnreq(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.5/4.6 (ADD STREAM)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 17       |      Parameter Length = 12    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |      Number of new streams    |         Reserved              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_addstrm(
+				const struct sctp_association *asoc,
+				__u16 out, __u16 in)
+{
+	struct sctp_strreset_addstrm addstrm;
+	__u16 size = sizeof(addstrm);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
+	if (!retval)
+		return NULL;
+
+	if (out) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(out);
+		addstrm.request_seq = htonl(asoc->strreset_outseq);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	if (in) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(in);
+		addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	return retval;
+}
-- 
cgit v1.2.3


From 242bd2d519d7194633e309286ba7ba29a1ad63e8 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 9 Feb 2017 01:18:20 +0800
Subject: sctp: implement sender-side procedures for Add Incoming/Outgoing
 Streams Request Parameter

This patch is to implement Sender-Side Procedures for the Add
Outgoing and Incoming Streams Request Parameter described in
rfc6525 section 5.1.5-5.1.6.

It is also to add sockopt SCTP_ADD_STREAMS in rfc6525 section
6.3.4 for users.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h   |  2 ++
 include/uapi/linux/sctp.h |  7 +++++
 net/sctp/socket.c         | 29 ++++++++++++++++++
 net/sctp/stream.c         | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index b60ca14068d8..6dfc5536a3e6 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -199,6 +199,8 @@ int sctp_offload_init(void);
 int sctp_send_reset_streams(struct sctp_association *asoc,
 			    struct sctp_reset_streams *params);
 int sctp_send_reset_assoc(struct sctp_association *asoc);
+int sctp_send_add_streams(struct sctp_association *asoc,
+			  struct sctp_add_streams *params);
 
 /*
  * Module global variables
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c0bd8c3d565a..a91a9cccbae6 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -118,6 +118,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ENABLE_STREAM_RESET	118
 #define SCTP_RESET_STREAMS	119
 #define SCTP_RESET_ASSOC	120
+#define SCTP_ADD_STREAMS	121
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -1027,4 +1028,10 @@ struct sctp_reset_streams {
 	uint16_t srs_stream_list[];	/* list if srs_num_streams is not 0 */
 };
 
+struct sctp_add_streams {
+	sctp_assoc_t sas_assoc_id;
+	uint16_t sas_instrms;
+	uint16_t sas_outstrms;
+};
+
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 45a7c417eb7f..75f35cea4371 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3844,6 +3844,32 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_add_streams(struct sock *sk,
+				       char __user *optval,
+				       unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_add_streams params;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, params.sas_assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_add_streams(asoc, &params);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4019,6 +4045,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_RESET_ASSOC:
 		retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
 		break;
+	case SCTP_ADD_STREAMS:
+		retval = sctp_setsockopt_add_streams(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 53e49fc2f0a3..eb02490245ba 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -217,3 +217,80 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
 
 	return 0;
 }
+
+int sctp_send_add_streams(struct sctp_association *asoc,
+			  struct sctp_add_streams *params)
+{
+	struct sctp_stream *stream = asoc->stream;
+	struct sctp_chunk *chunk = NULL;
+	int retval = -ENOMEM;
+	__u32 outcnt, incnt;
+	__u16 out, in;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
+		retval = -ENOPROTOOPT;
+		goto out;
+	}
+
+	if (asoc->strreset_outstanding) {
+		retval = -EINPROGRESS;
+		goto out;
+	}
+
+	out = params->sas_outstrms;
+	in  = params->sas_instrms;
+	outcnt = stream->outcnt + out;
+	incnt = stream->incnt + in;
+	if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM ||
+	    (!out && !in)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (out) {
+		struct sctp_stream_out *streamout;
+
+		streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
+				     GFP_KERNEL);
+		if (!streamout)
+			goto out;
+
+		memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
+		stream->out = streamout;
+	}
+
+	if (in) {
+		struct sctp_stream_in *streamin;
+
+		streamin = krealloc(stream->in, incnt * sizeof(*streamin),
+				    GFP_KERNEL);
+		if (!streamin)
+			goto out;
+
+		memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
+		stream->in = streamin;
+	}
+
+	chunk = sctp_make_strreset_addstrm(asoc, out, in);
+	if (!chunk)
+		goto out;
+
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+		goto out;
+	}
+
+	stream->incnt = incnt;
+	stream->outcnt = outcnt;
+
+	asoc->strreset_outstanding = !!out + !!in;
+
+out:
+	return retval;
+}
-- 
cgit v1.2.3


From 50f008e583d11b3133f23f29b05a0fcac2af3b40 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Wed, 8 Feb 2017 14:40:04 -0800
Subject: net: dsa: Fix duplicate object rule

While adding switch.o to the list of DSA object files, we essentially
duplicated the previous obj-y line and just added switch.o, remove the
duplicate.

Fixes: f515f192ab4f ("net: dsa: add switch notifier")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 72912982de3d..31d343796251 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,5 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o slave.o dsa2.o
 dsa_core-y += dsa.o slave.o dsa2.o switch.o
 
 # tagging formats
-- 
cgit v1.2.3


From 5e17da634a21b1200853fe82ba67d6571f2beabe Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:52 -0800
Subject: openvswitch: Fix comments for skb->_nfct

Fix comments referring to skb 'nfct' and 'nfctinfo' fields now that
they are combined into '_nfct'.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index fbffe0ea4c4f..5de6d12b3a73 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -157,7 +157,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_labels(ct, &key->ct.labels);
 }
 
-/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+/* Update 'key' based on skb->_nfct.  If 'post_ct' is true, then OVS has
  * previously sent the packet to conntrack via the ct action.  If
  * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
  * initialized from the connection status.
@@ -421,12 +421,12 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
 
 /* Find an existing connection which this packet belongs to without
  * re-attributing statistics or modifying the connection state.  This allows an
- * skb->nfct lost due to an upcall to be recovered during actions execution.
+ * skb->_nfct lost due to an upcall to be recovered during actions execution.
  *
  * Must be called with rcu_read_lock.
  *
- * On success, populates skb->nfct and skb->nfctinfo, and returns the
- * connection.  Returns NULL if there is no existing entry.
+ * On success, populates skb->_nfct and returns the connection.  Returns NULL
+ * if there is no existing entry.
  */
 static struct nf_conn *
 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
@@ -464,7 +464,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 	return ct;
 }
 
-/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
 static bool skb_nfct_cached(struct net *net,
 			    const struct sw_flow_key *key,
 			    const struct ovs_conntrack_info *info,
@@ -475,7 +475,7 @@ static bool skb_nfct_cached(struct net *net,
 
 	ct = nf_ct_get(skb, &ctinfo);
 	/* If no ct, check if we have evidence that an existing conntrack entry
-	 * might be found for this skb.  This happens when we lose a skb->nfct
+	 * might be found for this skb.  This happens when we lose a skb->_nfct
 	 * due to an upcall.  If the connection was not confirmed, it is not
 	 * cached and needs to be run through conntrack again.
 	 */
@@ -699,7 +699,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
- * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
  * set to NULL and 0 will be returned.
  */
 static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
-- 
cgit v1.2.3


From 9ff464db50e437eef131f719cc2e9902eea9c607 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:53 -0800
Subject: openvswitch: Use inverted tuple in ovs_ct_find_existing() if NATted.

The conntrack lookup for existing connections fails to invert the
packet 5-tuple for NATted packets, and therefore fails to find the
existing conntrack entry.  Conntrack only stores 5-tuples for incoming
packets, and there are various situations where a lookup on a packet
that has already been transformed by NAT needs to be made.  Looking up
an existing conntrack entry upon executing packet received from the
userspace is one of them.

This patch fixes ovs_ct_find_existing() to invert the packet 5-tuple
for the conntrack lookup whenever the packet has already been
transformed by conntrack from its input form as evidenced by one of
the NAT flags being set in the conntrack state metadata.

Fixes: 05752523e565 ("openvswitch: Interface with NAT.")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 5de6d12b3a73..4df9a5449c95 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -430,7 +430,7 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
  */
 static struct nf_conn *
 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
-		     u8 l3num, struct sk_buff *skb)
+		     u8 l3num, struct sk_buff *skb, bool natted)
 {
 	struct nf_conntrack_l3proto *l3proto;
 	struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +453,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 		return NULL;
 	}
 
+	/* Must invert the tuple if skb has been transformed by NAT. */
+	if (natted) {
+		struct nf_conntrack_tuple inverse;
+
+		if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+			pr_debug("ovs_ct_find_existing: Inversion failed!\n");
+			return NULL;
+		}
+		tuple = inverse;
+	}
+
 	/* look for tuple match */
 	h = nf_conntrack_find_get(net, zone, &tuple);
 	if (!h)
@@ -460,6 +471,13 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
+	/* Inverted packet tuple matches the reverse direction conntrack tuple,
+	 * select the other tuplehash to get the right 'ctinfo' bits for this
+	 * packet.
+	 */
+	if (natted)
+		h = &ct->tuplehash[!h->tuple.dst.dir];
+
 	nf_ct_set(skb, ct, ovs_ct_get_info(h));
 	return ct;
 }
@@ -482,7 +500,9 @@ static bool skb_nfct_cached(struct net *net,
 	if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
 	    !(key->ct.state & OVS_CS_F_INVALID) &&
 	    key->ct.zone == info->zone.id)
-		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
+		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+					  !!(key->ct.state
+					     & OVS_CS_F_NAT_MASK));
 	if (!ct)
 		return false;
 	if (!net_eq(net, read_pnet(&ct->ct_net)))
-- 
cgit v1.2.3


From 193e30967897f3a8b6f9f137ac30571d832c2c5c Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:54 -0800
Subject: openvswitch: Do not trigger events for unconfirmed connections.

Receiving change events before the 'new' event for the connection has
been received can be confusing.  Avoid triggering change events for
setting conntrack mark or labels before the conntrack entry has been
confirmed.

Fixes: 182e3042e15d ("openvswitch: Allow matching on conntrack mark")
Fixes: c2ac66735870 ("openvswitch: Allow matching on conntrack label")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 4df9a5449c95..a6ff374d57d3 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -245,7 +245,8 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
 	new_mark = ct_mark | (ct->mark & ~(mask));
 	if (ct->mark != new_mark) {
 		ct->mark = new_mark;
-		nf_conntrack_event_cache(IPCT_MARK, ct);
+		if (nf_ct_is_confirmed(ct))
+			nf_conntrack_event_cache(IPCT_MARK, ct);
 		key->ct.mark = new_mark;
 	}
 
@@ -262,7 +263,6 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_labels *cl;
 	struct nf_conn *ct;
-	int err;
 
 	/* The connection could be invalid, in which case set_label is no-op.*/
 	ct = nf_ct_get(skb, &ctinfo);
@@ -277,10 +277,26 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
 	if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
 		return -ENOSPC;
 
-	err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
-				    OVS_CT_LABELS_LEN / sizeof(u32));
-	if (err)
-		return err;
+	if (nf_ct_is_confirmed(ct)) {
+		/* Triggers a change event, which makes sense only for
+		 * confirmed connections.
+		 */
+		int err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
+						OVS_CT_LABELS_LEN / sizeof(u32));
+		if (err)
+			return err;
+	} else {
+		u32 *dst = (u32 *)cl->bits;
+		const u32 *msk = (const u32 *)mask->ct_labels;
+		const u32 *lbl = (const u32 *)labels->ct_labels;
+		int i;
+
+		/* No-one else has access to the non-confirmed entry, copy
+		 * labels over, keeping any bits we are not explicitly setting.
+		 */
+		for (i = 0; i < OVS_CT_LABELS_LEN / sizeof(u32); i++)
+			dst[i] = (dst[i] & ~msk[i]) | (lbl[i] & msk[i]);
+	}
 
 	ovs_ct_get_labels(ct, &key->ct.labels);
 	return 0;
-- 
cgit v1.2.3


From cb80d58fae76d8ea93555149b2b16e19b89a1f4f Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:55 -0800
Subject: openvswitch: Unionize ovs_key_ct_label with a u32 array.

Make the array of labels in struct ovs_key_ct_label an union, adding a
u32 array of the same byte size as the existing u8 array.  It is
faster to loop through the labels 32 bits at the time, which is also
the alignment of netlink attributes.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  8 ++++++--
 net/openvswitch/conntrack.c      | 15 ++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 375d812fea36..96aee34ef55f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -446,9 +446,13 @@ struct ovs_key_nd {
 	__u8	nd_tll[ETH_ALEN];
 };
 
-#define OVS_CT_LABELS_LEN	16
+#define OVS_CT_LABELS_LEN_32	4
+#define OVS_CT_LABELS_LEN	(OVS_CT_LABELS_LEN_32 * sizeof(__u32))
 struct ovs_key_ct_labels {
-	__u8	ct_labels[OVS_CT_LABELS_LEN];
+	union {
+		__u8	ct_labels[OVS_CT_LABELS_LEN];
+		__u32	ct_labels_32[OVS_CT_LABELS_LEN_32];
+	};
 };
 
 /* OVS_KEY_ATTR_CT_STATE flags */
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index a6ff374d57d3..f23934ccce20 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -281,20 +281,21 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
 		/* Triggers a change event, which makes sense only for
 		 * confirmed connections.
 		 */
-		int err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
-						OVS_CT_LABELS_LEN / sizeof(u32));
+		int err = nf_connlabels_replace(ct, labels->ct_labels_32,
+						mask->ct_labels_32,
+						OVS_CT_LABELS_LEN_32);
 		if (err)
 			return err;
 	} else {
 		u32 *dst = (u32 *)cl->bits;
-		const u32 *msk = (const u32 *)mask->ct_labels;
-		const u32 *lbl = (const u32 *)labels->ct_labels;
+		const u32 *msk = mask->ct_labels_32;
+		const u32 *lbl = labels->ct_labels_32;
 		int i;
 
 		/* No-one else has access to the non-confirmed entry, copy
 		 * labels over, keeping any bits we are not explicitly setting.
 		 */
-		for (i = 0; i < OVS_CT_LABELS_LEN / sizeof(u32); i++)
+		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
 			dst[i] = (dst[i] & ~msk[i]) | (lbl[i] & msk[i]);
 	}
 
@@ -866,8 +867,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 {
 	size_t i;
 
-	for (i = 0; i < sizeof(*labels); i++)
-		if (labels->ct_labels[i])
+	for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+		if (labels->ct_labels_32[i])
 			return true;
 
 	return false;
-- 
cgit v1.2.3


From b87cec3814ccc7f6afb0a1378ee7e5110d07cdd3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:56 -0800
Subject: openvswitch: Simplify labels length logic.

Since 23014011ba42 ("netfilter: conntrack: support a fixed size of 128
distinct labels"), the size of conntrack labels extension has fixed to
128 bits, so we do not need to check for labels sizes shorter than 128
at run-time.  This patch simplifies labels length logic accordingly,
but allows the conntrack labels size to be increased in the future
without breaking the build.  In the event of conntrack labels
increasing in size OVS would still be able to deal with the 128 first
label bits.

Suggested-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f23934ccce20..fe2a410ce70a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -129,22 +129,20 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
 #endif
 }
 
+/* Guard against conntrack labels max size shrinking below 128 bits. */
+#if NF_CT_LABELS_MAX_SIZE < 16
+#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
+#endif
+
 static void ovs_ct_get_labels(const struct nf_conn *ct,
 			      struct ovs_key_ct_labels *labels)
 {
 	struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
 
-	if (cl) {
-		size_t len = sizeof(cl->bits);
-
-		if (len > OVS_CT_LABELS_LEN)
-			len = OVS_CT_LABELS_LEN;
-		else if (len < OVS_CT_LABELS_LEN)
-			memset(labels, 0, OVS_CT_LABELS_LEN);
-		memcpy(labels, cl->bits, len);
-	} else {
+	if (cl)
+		memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
+	else
 		memset(labels, 0, OVS_CT_LABELS_LEN);
-	}
 }
 
 static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
@@ -274,7 +272,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
 		nf_ct_labels_ext_add(ct);
 		cl = nf_ct_labels_find(ct);
 	}
-	if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
+	if (!cl)
 		return -ENOSPC;
 
 	if (nf_ct_is_confirmed(ct)) {
-- 
cgit v1.2.3


From 6ffcea79957df43caeaa6d1de5062556a5afc262 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:57 -0800
Subject: openvswitch: Refactor labels initialization.

Refactoring conntrack labels initialization makes changes in later
patches easier to review.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 104 ++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index fe2a410ce70a..7c5bb98c22c6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -227,19 +227,12 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
 	return 0;
 }
 
-static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
 			   u32 ct_mark, u32 mask)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
 	u32 new_mark;
 
-	/* The connection could be invalid, in which case set_mark is no-op. */
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct)
-		return 0;
-
 	new_mark = ct_mark | (ct->mark & ~(mask));
 	if (ct->mark != new_mark) {
 		ct->mark = new_mark;
@@ -254,50 +247,66 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
 #endif
 }
 
-static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
-			     const struct ovs_key_ct_labels *labels,
-			     const struct ovs_key_ct_labels *mask)
+static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
 {
-	enum ip_conntrack_info ctinfo;
 	struct nf_conn_labels *cl;
-	struct nf_conn *ct;
-
-	/* The connection could be invalid, in which case set_label is no-op.*/
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct)
-		return 0;
 
 	cl = nf_ct_labels_find(ct);
 	if (!cl) {
 		nf_ct_labels_ext_add(ct);
 		cl = nf_ct_labels_find(ct);
 	}
+
+	return cl;
+}
+
+/* Initialize labels for a new, yet to be committed conntrack entry.  Note that
+ * since the new connection is not yet confirmed, and thus no-one else has
+ * access to it's labels, we simply write them over.  Also, we refrain from
+ * triggering events, as receiving change events before the create event would
+ * be confusing.
+ */
+static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
+			      const struct ovs_key_ct_labels *labels,
+			      const struct ovs_key_ct_labels *mask)
+{
+	struct nf_conn_labels *cl;
+	u32 *dst;
+	int i;
+
+	cl = ovs_ct_get_conn_labels(ct);
 	if (!cl)
 		return -ENOSPC;
 
-	if (nf_ct_is_confirmed(ct)) {
-		/* Triggers a change event, which makes sense only for
-		 * confirmed connections.
-		 */
-		int err = nf_connlabels_replace(ct, labels->ct_labels_32,
-						mask->ct_labels_32,
-						OVS_CT_LABELS_LEN_32);
-		if (err)
-			return err;
-	} else {
-		u32 *dst = (u32 *)cl->bits;
-		const u32 *msk = mask->ct_labels_32;
-		const u32 *lbl = labels->ct_labels_32;
-		int i;
+	dst = (u32 *)cl->bits;
+	for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+		dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+			(labels->ct_labels_32[i] & mask->ct_labels_32[i]);
 
-		/* No-one else has access to the non-confirmed entry, copy
-		 * labels over, keeping any bits we are not explicitly setting.
-		 */
-		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
-			dst[i] = (dst[i] & ~msk[i]) | (lbl[i] & msk[i]);
-	}
+	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
+	return 0;
+}
+
+static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
+			     const struct ovs_key_ct_labels *labels,
+			     const struct ovs_key_ct_labels *mask)
+{
+	struct nf_conn_labels *cl;
+	int err;
+
+	cl = ovs_ct_get_conn_labels(ct);
+	if (!cl)
+		return -ENOSPC;
+
+	err = nf_connlabels_replace(ct, labels->ct_labels_32,
+				    mask->ct_labels_32,
+				    OVS_CT_LABELS_LEN_32);
+	if (err)
+		return err;
+
+	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
 
-	ovs_ct_get_labels(ct, &key->ct.labels);
 	return 0;
 }
 
@@ -877,25 +886,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
 			 struct sk_buff *skb)
 {
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
 	int err;
 
 	err = __ovs_ct_lookup(net, key, info, skb);
 	if (err)
 		return err;
 
+	/* The connection could be invalid, in which case this is a no-op.*/
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return 0;
+
 	/* Apply changes before confirming the connection so that the initial
 	 * conntrack NEW netlink event carries the values given in the CT
 	 * action.
 	 */
 	if (info->mark.mask) {
-		err = ovs_ct_set_mark(skb, key, info->mark.value,
+		err = ovs_ct_set_mark(ct, key, info->mark.value,
 				      info->mark.mask);
 		if (err)
 			return err;
 	}
 	if (labels_nonzero(&info->labels.mask)) {
-		err = ovs_ct_set_labels(skb, key, &info->labels.value,
-					&info->labels.mask);
+		if (!nf_ct_is_confirmed(ct))
+			err = ovs_ct_init_labels(ct, key, &info->labels.value,
+						 &info->labels.mask);
+		else
+			err = ovs_ct_set_labels(ct, key, &info->labels.value,
+						&info->labels.mask);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 09aa98ad496d6b11a698b258bc64d7f64c55d682 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:58 -0800
Subject: openvswitch: Inherit master's labels.

We avoid calling into nf_conntrack_in() for expected connections, as
that would remove the expectation that we want to stick around until
we are ready to commit the connection.  Instead, we do a lookup in the
expectation table directly.  However, after a successful expectation
lookup we have set the flow key label field from the master
connection, whereas nf_conntrack_in() does not do this.  This leads to
master's labels being inherited after an expectation lookup, but those
labels not being inherited after the corresponding conntrack action
with a commit flag.

This patch resolves the problem by changing the commit code path to
also inherit the master's labels to the expected connection.
Resolving this conflict in favor of inheriting the labels allows more
information be passed from the master connection to related
connections, which would otherwise be much harder if the 32 bits in
the connmark are not enough.  Labels can still be set explicitly, so
this change only affects the default values of the labels in presense
of a master connection.

Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 7c5bb98c22c6..f989ccf38eab 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -73,6 +73,8 @@ struct ovs_conntrack_info {
 #endif
 };
 
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
+
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
 
 static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -270,18 +272,32 @@ static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
 			      const struct ovs_key_ct_labels *labels,
 			      const struct ovs_key_ct_labels *mask)
 {
-	struct nf_conn_labels *cl;
-	u32 *dst;
-	int i;
+	struct nf_conn_labels *cl, *master_cl;
+	bool have_mask = labels_nonzero(mask);
+
+	/* Inherit master's labels to the related connection? */
+	master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+	if (!master_cl && !have_mask)
+		return 0;   /* Nothing to do. */
 
 	cl = ovs_ct_get_conn_labels(ct);
 	if (!cl)
 		return -ENOSPC;
 
-	dst = (u32 *)cl->bits;
-	for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
-		dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
-			(labels->ct_labels_32[i] & mask->ct_labels_32[i]);
+	/* Inherit the master's labels, if any. */
+	if (master_cl)
+		*cl = *master_cl;
+
+	if (have_mask) {
+		u32 *dst = (u32 *)cl->bits;
+		int i;
+
+		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+			dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+				(labels->ct_labels_32[i]
+				 & mask->ct_labels_32[i]);
+	}
 
 	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
 
@@ -909,13 +925,14 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 		if (err)
 			return err;
 	}
-	if (labels_nonzero(&info->labels.mask)) {
-		if (!nf_ct_is_confirmed(ct))
-			err = ovs_ct_init_labels(ct, key, &info->labels.value,
-						 &info->labels.mask);
-		else
-			err = ovs_ct_set_labels(ct, key, &info->labels.value,
-						&info->labels.mask);
+	if (!nf_ct_is_confirmed(ct)) {
+		err = ovs_ct_init_labels(ct, key, &info->labels.value,
+					 &info->labels.mask);
+		if (err)
+			return err;
+	} else if (labels_nonzero(&info->labels.mask)) {
+		err = ovs_ct_set_labels(ct, key, &info->labels.value,
+					&info->labels.mask);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 9dd7f8907c3705dc7a7a375d1c6e30b06e6daffc Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:21:59 -0800
Subject: openvswitch: Add original direction conntrack tuple to sw_flow_key.

Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key.  The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry.  This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.

The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.

The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple.  This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.

When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state.  While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards.  If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change.  When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.

It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information.  If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.

The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields.  Hence, the IP addresses are overlaid in union with ARP
and ND fields.  This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets.  ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 20 +++++++++-
 net/openvswitch/actions.c        |  2 +
 net/openvswitch/conntrack.c      | 86 +++++++++++++++++++++++++++++++++++++---
 net/openvswitch/conntrack.h      | 10 ++++-
 net/openvswitch/flow.c           | 34 +++++++++++++---
 net/openvswitch/flow.h           | 49 ++++++++++++++++++-----
 net/openvswitch/flow_netlink.c   | 85 +++++++++++++++++++++++++++++----------
 net/openvswitch/flow_netlink.h   |  7 +++-
 8 files changed, 246 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 96aee34ef55f..90af8b8e10f8 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -331,6 +331,8 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_ZONE,	/* u16 connection tracking zone. */
 	OVS_KEY_ATTR_CT_MARK,	/* u32 connection tracking mark */
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
+	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
+	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -472,6 +474,22 @@ struct ovs_key_ct_labels {
 
 #define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
+struct ovs_key_ct_tuple_ipv4 {
+	__be32 ipv4_src;
+	__be32 ipv4_dst;
+	__be16 src_port;
+	__be16 dst_port;
+	__u8   ipv4_proto;
+};
+
+struct ovs_key_ct_tuple_ipv6 {
+	__be32 ipv6_src[4];
+	__be32 ipv6_dst[4];
+	__be16 src_port;
+	__be16 dst_port;
+	__u8   ipv6_proto;
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index efa9a8858cc6..b1beb2b94ec7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1074,6 +1074,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_CT_ZONE:
 	case OVS_KEY_ATTR_CT_MARK:
 	case OVS_KEY_ATTR_CT_LABELS:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
 		err = -EINVAL;
 		break;
 	}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f989ccf38eab..bfd7606c8be1 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -147,6 +147,20 @@ static void ovs_ct_get_labels(const struct nf_conn *ct,
 		memset(labels, 0, OVS_CT_LABELS_LEN);
 }
 
+static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
+					const struct nf_conntrack_tuple *orig,
+					u8 icmp_proto)
+{
+	key->ct.orig_proto = orig->dst.protonum;
+	if (orig->dst.protonum == icmp_proto) {
+		key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
+		key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
+	} else {
+		key->ct.orig_tp.src = orig->src.u.all;
+		key->ct.orig_tp.dst = orig->dst.u.all;
+	}
+}
+
 static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 				const struct nf_conntrack_zone *zone,
 				const struct nf_conn *ct)
@@ -155,6 +169,35 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	key->ct.zone = zone->id;
 	key->ct.mark = ovs_ct_get_mark(ct);
 	ovs_ct_get_labels(ct, &key->ct.labels);
+
+	if (ct) {
+		const struct nf_conntrack_tuple *orig;
+
+		/* Use the master if we have one. */
+		if (ct->master)
+			ct = ct->master;
+		orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+		/* IP version must match with the master connection. */
+		if (key->eth.type == htons(ETH_P_IP) &&
+		    nf_ct_l3num(ct) == NFPROTO_IPV4) {
+			key->ipv4.ct_orig.src = orig->src.u3.ip;
+			key->ipv4.ct_orig.dst = orig->dst.u3.ip;
+			__ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
+			return;
+		} else if (key->eth.type == htons(ETH_P_IPV6) &&
+			   !sw_flow_key_is_nd(key) &&
+			   nf_ct_l3num(ct) == NFPROTO_IPV6) {
+			key->ipv6.ct_orig.src = orig->src.u3.in6;
+			key->ipv6.ct_orig.dst = orig->dst.u3.in6;
+			__ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
+			return;
+		}
+	}
+	/* Clear 'ct.orig_proto' to mark the non-existence of conntrack
+	 * original direction key fields.
+	 */
+	key->ct.orig_proto = 0;
 }
 
 /* Update 'key' based on skb->_nfct.  If 'post_ct' is true, then OVS has
@@ -208,24 +251,55 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+#define IN6_ADDR_INITIALIZER(ADDR) \
+	{ (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
+	  (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
+
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+		   const struct sw_flow_key *output, struct sk_buff *skb)
 {
-	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct.state))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
-	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct.zone))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
-	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
-		    &key->ct.labels))
+	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
+		    &output->ct.labels))
 		return -EMSGSIZE;
 
+	if (swkey->ct.orig_proto) {
+		if (swkey->eth.type == htons(ETH_P_IP)) {
+			struct ovs_key_ct_tuple_ipv4 orig = {
+				output->ipv4.ct_orig.src,
+				output->ipv4.ct_orig.dst,
+				output->ct.orig_tp.src,
+				output->ct.orig_tp.dst,
+				output->ct.orig_proto,
+			};
+			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
+				    sizeof(orig), &orig))
+				return -EMSGSIZE;
+		} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+			struct ovs_key_ct_tuple_ipv6 orig = {
+				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
+				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
+				output->ct.orig_tp.src,
+				output->ct.orig_tp.dst,
+				output->ct.orig_proto,
+			};
+			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
+				    sizeof(orig), &orig))
+				return -EMSGSIZE;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..9e92445dc092 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 		   const struct ovs_conntrack_info *);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+		   const struct sw_flow_key *output, struct sk_buff *skb);
 void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -79,9 +80,14 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb,
 	key->ct.zone = 0;
 	key->ct.mark = 0;
 	memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+	/* Clear 'ct.orig_proto' to mark the non-existence of original
+	 * direction key fields.
+	 */
+	key->ct.orig_proto = 0;
 }
 
-static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
+				 const struct sw_flow_key *output,
 				 struct sk_buff *skb)
 {
 	return 0;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2c0a00f7f1b7..9d4bb8eb63f2 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -765,7 +765,7 @@ static int key_extract_mac_proto(struct sk_buff *skb)
 int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
-	int res;
+	int res, err;
 
 	/* Extract metadata from packet. */
 	if (tun_info) {
@@ -792,7 +792,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 	key->phy.priority = skb->priority;
 	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
 	key->phy.skb_mark = skb->mark;
-	ovs_ct_fill_key(skb, key);
 	key->ovs_flow_hash = 0;
 	res = key_extract_mac_proto(skb);
 	if (res < 0)
@@ -800,17 +799,26 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 	key->mac_proto = res;
 	key->recirc_id = 0;
 
-	return key_extract(skb, key);
+	err = key_extract(skb, key);
+	if (!err)
+		ovs_ct_fill_key(skb, key);   /* Must be after key_extract(). */
+	return err;
 }
 
 int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 				   struct sk_buff *skb,
 				   struct sw_flow_key *key, bool log)
 {
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	u64 attrs = 0;
 	int err;
 
+	err = parse_flow_nlattrs(attr, a, &attrs, log);
+	if (err)
+		return -EINVAL;
+
 	/* Extract metadata from netlink attributes. */
-	err = ovs_nla_get_flow_metadata(net, attr, key, log);
+	err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
 	if (err)
 		return err;
 
@@ -824,5 +832,21 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 	 */
 
 	skb->protocol = key->eth.type;
-	return key_extract(skb, key);
+	err = key_extract(skb, key);
+	if (err)
+		return err;
+
+	/* Check that we have conntrack original direction tuple metadata only
+	 * for packets for which it makes sense.  Otherwise the key may be
+	 * corrupted due to overlapping key fields.
+	 */
+	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
+	    key->eth.type != htons(ETH_P_IP))
+		return -EINVAL;
+	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
+	    (key->eth.type != htons(ETH_P_IPV6) ||
+	     sw_flow_key_is_nd(key)))
+		return -EINVAL;
+
+	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index f61cae7f9030..76e05b25f030 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -107,10 +107,16 @@ struct sw_flow_key {
 				__be32 src;	/* IP source address. */
 				__be32 dst;	/* IP destination address. */
 			} addr;
-			struct {
-				u8 sha[ETH_ALEN];	/* ARP source hardware address. */
-				u8 tha[ETH_ALEN];	/* ARP target hardware address. */
-			} arp;
+			union {
+				struct {
+					__be32 src;
+					__be32 dst;
+				} ct_orig;	/* Conntrack original direction fields. */
+				struct {
+					u8 sha[ETH_ALEN];	/* ARP source hardware address. */
+					u8 tha[ETH_ALEN];	/* ARP target hardware address. */
+				} arp;
+			};
 		} ipv4;
 		struct {
 			struct {
@@ -118,23 +124,44 @@ struct sw_flow_key {
 				struct in6_addr dst;	/* IPv6 destination address. */
 			} addr;
 			__be32 label;			/* IPv6 flow label. */
-			struct {
-				struct in6_addr target;	/* ND target address. */
-				u8 sll[ETH_ALEN];	/* ND source link layer address. */
-				u8 tll[ETH_ALEN];	/* ND target link layer address. */
-			} nd;
+			union {
+				struct {
+					struct in6_addr src;
+					struct in6_addr dst;
+				} ct_orig;	/* Conntrack original direction fields. */
+				struct {
+					struct in6_addr target;	/* ND target address. */
+					u8 sll[ETH_ALEN];	/* ND source link layer address. */
+					u8 tll[ETH_ALEN];	/* ND target link layer address. */
+				} nd;
+			};
 		} ipv6;
 	};
 	struct {
 		/* Connection tracking fields. */
+		u8 state;
+		u8 orig_proto;		/* CT orig tuple IP protocol. */
 		u16 zone;
 		u32 mark;
-		u8 state;
+		struct {
+			__be16 src;	/* CT orig tuple tp src port. */
+			__be16 dst;	/* CT orig tuple tp dst port. */
+		} orig_tp;
+
 		struct ovs_key_ct_labels labels;
 	} ct;
 
 } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
 
+static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
+{
+	return key->eth.type == htons(ETH_P_IPV6) &&
+		key->ip.proto == NEXTHDR_ICMP &&
+		key->tp.dst == 0 &&
+		(key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+		 key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
+}
+
 struct sw_flow_key_range {
 	unsigned short int start;
 	unsigned short int end;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c87d359b9b37..989f38f120bb 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -129,7 +129,9 @@ static bool match_validate(const struct sw_flow_match *match,
 	/* The following mask attributes allowed only if they
 	 * pass the validation tests. */
 	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+			| (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
 			| (1 << OVS_KEY_ATTR_IPV6)
+			| (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
 			| (1 << OVS_KEY_ATTR_TCP)
 			| (1 << OVS_KEY_ATTR_TCP_FLAGS)
 			| (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
 
 	if (match->key->eth.type == htons(ETH_P_IP)) {
 		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
 			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+			mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
+		}
 
 		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
 			if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
 
 	if (match->key->eth.type == htons(ETH_P_IPV6)) {
 		key_expected |= 1 << OVS_KEY_ATTR_IPV6;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
 			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+			mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
+		}
 
 		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
 			if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
 						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
 				    match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
 					key_expected |= 1 << OVS_KEY_ATTR_ND;
+					/* Original direction conntrack tuple
+					 * uses the same space as the ND fields
+					 * in the key, so both are not allowed
+					 * at the same time.
+					 */
+					mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
 					if (match->mask && (match->mask->key.tp.src == htons(0xff)))
 						mask_allowed |= 1 << OVS_KEY_ATTR_ND;
 				}
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
+		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_CT_ZONE]	 = { .len = sizeof(u16) },
 	[OVS_KEY_ATTR_CT_MARK]	 = { .len = sizeof(u32) },
 	[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
+	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
+		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
+	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
+		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
 	return __parse_flow_nlattrs(attr, a, attrsp, log, true);
 }
 
-static int parse_flow_nlattrs(const struct nlattr *attr,
-			      const struct nlattr *a[], u64 *attrsp,
-			      bool log)
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+		       u64 *attrsp, bool log)
 {
 	return __parse_flow_nlattrs(attr, a, attrsp, log, false);
 }
@@ -1082,6 +1098,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				   sizeof(*cl), is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
 	}
+	if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
+		const struct ovs_key_ct_tuple_ipv4 *ct;
+
+		ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
+
+		SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv4_proto, is_mask);
+		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
+	}
+	if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
+		const struct ovs_key_ct_tuple_ipv6 *ct;
+
+		ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
+
+		SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
+				   sizeof(match->key->ipv6.ct_orig.src),
+				   is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
+				   sizeof(match->key->ipv6.ct_orig.dst),
+				   is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv6_proto, is_mask);
+		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
+	}
 
 	/* For layer 3 packets the Ethernet type is provided
 	 * and treated as metadata but no MAC addresses are provided.
@@ -1493,9 +1537,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
 
 /**
  * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
- * @key: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
+ * @net: Network namespace.
+ * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
+ * metadata.
+ * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
+ * attributes.
+ * @attrs: Bit mask for the netlink attributes included in @a.
  * @log: Boolean to allow kernel error logging.  Normally true, but when
  * probing for feature compatibility this should be passed in as false to
  * suppress unnecessary error logging.
@@ -1504,25 +1551,23 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
  * take the same form accepted by flow_from_nlattrs(), but only enough of it to
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
+ *
+ * This must be called before the packet key fields are filled in 'key'.
  */
 
-int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
-			      struct sw_flow_key *key,
-			      bool log)
+int ovs_nla_get_flow_metadata(struct net *net,
+			      const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+			      u64 attrs, struct sw_flow_key *key, bool log)
 {
-	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
 	struct sw_flow_match match;
-	u64 attrs = 0;
-	int err;
-
-	err = parse_flow_nlattrs(attr, a, &attrs, log);
-	if (err)
-		return -EINVAL;
 
 	memset(&match, 0, sizeof(match));
 	match.key = key;
 
 	memset(&key->ct, 0, sizeof(key->ct));
+	memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
+	memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
+
 	key->phy.in_port = DP_MAX_PORTS;
 
 	return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1584,7 +1629,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
 		goto nla_put_failure;
 
-	if (ovs_ct_put_key(output, skb))
+	if (ovs_ct_put_key(swkey, output, skb))
 		goto nla_put_failure;
 
 	if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
 
 int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
 		    int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
-			      struct sw_flow_key *, bool log);
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+		       u64 *attrsp, bool log);
+int ovs_nla_get_flow_metadata(struct net *net,
+			      const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+			      u64 attrs, struct sw_flow_key *key, bool log);
 
 int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
 int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
-- 
cgit v1.2.3


From dd41d33f0b033885211a5d6f3ee19e73238aa9ee Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:22:00 -0800
Subject: openvswitch: Add force commit.

Stateful network admission policy may allow connections to one
direction and reject connections initiated in the other direction.
After policy change it is possible that for a new connection an
overlapping conntrack entry already exists, where the original
direction of the existing connection is opposed to the new
connection's initial packet.

Most importantly, conntrack state relating to the current packet gets
the "reply" designation based on whether the original direction tuple
or the reply direction tuple matched.  If this "directionality" is
wrong w.r.t. to the stateful network admission policy it may happen
that packets in neither direction are correctly admitted.

This patch adds a new "force commit" option to the OVS conntrack
action that checks the original direction of an existing conntrack
entry.  If that direction is opposed to the current packet, the
existing conntrack entry is deleted and a new one is subsequently
created in the correct direction.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  5 +++++
 net/openvswitch/conntrack.c      | 26 ++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 90af8b8e10f8..7f41f7d0000f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -674,6 +674,10 @@ struct ovs_action_hash {
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
  * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
  * translation (NAT) on the packet.
+ * @OVS_CT_ATTR_FORCE_COMMIT: Like %OVS_CT_ATTR_COMMIT, but instead of doing
+ * nothing if the connection is already committed will check that the current
+ * packet is in conntrack entry's original direction.  If directionality does
+ * not match, will delete the existing conntrack entry and commit a new one.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -684,6 +688,7 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
 	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
+	OVS_CT_ATTR_FORCE_COMMIT,  /* No argument */
 	__OVS_CT_ATTR_MAX
 };
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index bfd7606c8be1..8b15bab70583 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
 	struct nf_conn *ct;
 	u8 commit : 1;
 	u8 nat : 3;                 /* enum ovs_ct_nat */
+	u8 force : 1;
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
@@ -613,10 +614,13 @@ static bool skb_nfct_cached(struct net *net,
 	 */
 	if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
 	    !(key->ct.state & OVS_CS_F_INVALID) &&
-	    key->ct.zone == info->zone.id)
+	    key->ct.zone == info->zone.id) {
 		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
 					  !!(key->ct.state
 					     & OVS_CS_F_NAT_MASK));
+		if (ct)
+			nf_ct_get(skb, &ctinfo);
+	}
 	if (!ct)
 		return false;
 	if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -630,6 +634,18 @@ static bool skb_nfct_cached(struct net *net,
 		if (help && rcu_access_pointer(help->helper) != info->helper)
 			return false;
 	}
+	/* Force conntrack entry direction to the current packet? */
+	if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+		/* Delete the conntrack entry if confirmed, else just release
+		 * the reference.
+		 */
+		if (nf_ct_is_confirmed(ct))
+			nf_ct_delete(ct, 0, 0);
+		else
+			nf_conntrack_put(&ct->ct_general);
+		nf_ct_set(skb, NULL, 0);
+		return false;
+	}
 
 	return true;
 }
@@ -1207,6 +1223,7 @@ static int parse_nat(const struct nlattr *attr,
 
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
+	[OVS_CT_ATTR_FORCE_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
 				    .maxlen = sizeof(u16) },
 	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),
@@ -1246,6 +1263,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 		}
 
 		switch (type) {
+		case OVS_CT_ATTR_FORCE_COMMIT:
+			info->force = true;
+			/* fall through. */
 		case OVS_CT_ATTR_COMMIT:
 			info->commit = true;
 			break;
@@ -1472,7 +1492,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 	if (!start)
 		return -EMSGSIZE;
 
-	if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+	if (ct_info->commit && nla_put_flag(skb, ct_info->force
+					    ? OVS_CT_ATTR_FORCE_COMMIT
+					    : OVS_CT_ATTR_COMMIT))
 		return -EMSGSIZE;
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
 	    nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
-- 
cgit v1.2.3


From 316d4d78cf9b6795b83f97c45368748741df418c Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 9 Feb 2017 11:22:01 -0800
Subject: openvswitch: Pack struct sw_flow_key.

struct sw_flow_key has two 16-bit holes. Move the most matched
conntrack match fields there.  In some typical cases this reduces the
size of the key that needs to be hashed into half and into one cache
line.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c    | 40 ++++++++++++++++++++--------------------
 net/openvswitch/conntrack.h    |  8 ++++----
 net/openvswitch/flow.h         | 14 ++++++++------
 net/openvswitch/flow_netlink.c | 11 +++++++----
 4 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 8b15bab70583..c2d452eab0c5 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -152,7 +152,7 @@ static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
 					const struct nf_conntrack_tuple *orig,
 					u8 icmp_proto)
 {
-	key->ct.orig_proto = orig->dst.protonum;
+	key->ct_orig_proto = orig->dst.protonum;
 	if (orig->dst.protonum == icmp_proto) {
 		key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
 		key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
@@ -166,8 +166,8 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 				const struct nf_conntrack_zone *zone,
 				const struct nf_conn *ct)
 {
-	key->ct.state = state;
-	key->ct.zone = zone->id;
+	key->ct_state = state;
+	key->ct_zone = zone->id;
 	key->ct.mark = ovs_ct_get_mark(ct);
 	ovs_ct_get_labels(ct, &key->ct.labels);
 
@@ -195,10 +195,10 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 			return;
 		}
 	}
-	/* Clear 'ct.orig_proto' to mark the non-existence of conntrack
+	/* Clear 'ct_orig_proto' to mark the non-existence of conntrack
 	 * original direction key fields.
 	 */
-	key->ct.orig_proto = 0;
+	key->ct_orig_proto = 0;
 }
 
 /* Update 'key' based on skb->_nfct.  If 'post_ct' is true, then OVS has
@@ -228,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
 		if (keep_nat_flags) {
-			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+			state |= key->ct_state & OVS_CS_F_NAT_MASK;
 		} else {
 			if (ct->status & IPS_SRC_NAT)
 				state |= OVS_CS_F_SRC_NAT;
@@ -259,11 +259,11 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
 		   const struct sw_flow_key *output, struct sk_buff *skb)
 {
-	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct.state))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
-	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct.zone))
+	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
@@ -275,14 +275,14 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey,
 		    &output->ct.labels))
 		return -EMSGSIZE;
 
-	if (swkey->ct.orig_proto) {
+	if (swkey->ct_orig_proto) {
 		if (swkey->eth.type == htons(ETH_P_IP)) {
 			struct ovs_key_ct_tuple_ipv4 orig = {
 				output->ipv4.ct_orig.src,
 				output->ipv4.ct_orig.dst,
 				output->ct.orig_tp.src,
 				output->ct.orig_tp.dst,
-				output->ct.orig_proto,
+				output->ct_orig_proto,
 			};
 			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
 				    sizeof(orig), &orig))
@@ -293,7 +293,7 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey,
 				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
 				output->ct.orig_tp.src,
 				output->ct.orig_tp.dst,
-				output->ct.orig_proto,
+				output->ct_orig_proto,
 			};
 			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
 				    sizeof(orig), &orig))
@@ -612,11 +612,11 @@ static bool skb_nfct_cached(struct net *net,
 	 * due to an upcall.  If the connection was not confirmed, it is not
 	 * cached and needs to be run through conntrack again.
 	 */
-	if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
-	    !(key->ct.state & OVS_CS_F_INVALID) &&
-	    key->ct.zone == info->zone.id) {
+	if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
+	    !(key->ct_state & OVS_CS_F_INVALID) &&
+	    key->ct_zone == info->zone.id) {
 		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
-					  !!(key->ct.state
+					  !!(key->ct_state
 					     & OVS_CS_F_NAT_MASK));
 		if (ct)
 			nf_ct_get(skb, &ctinfo);
@@ -740,7 +740,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		__be16 src;
 
-		key->ct.state |= OVS_CS_F_SRC_NAT;
+		key->ct_state |= OVS_CS_F_SRC_NAT;
 		if (key->eth.type == htons(ETH_P_IP))
 			key->ipv4.addr.src = ip_hdr(skb)->saddr;
 		else if (key->eth.type == htons(ETH_P_IPV6))
@@ -762,7 +762,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
 	} else {
 		__be16 dst;
 
-		key->ct.state |= OVS_CS_F_DST_NAT;
+		key->ct_state |= OVS_CS_F_DST_NAT;
 		if (key->eth.type == htons(ETH_P_IP))
 			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
 		else if (key->eth.type == htons(ETH_P_IPV6))
@@ -886,7 +886,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		 * NAT after the nf_conntrack_in() call.  We can actually clear
 		 * the whole state, as it will be re-initialized below.
 		 */
-		key->ct.state = 0;
+		key->ct_state = 0;
 
 		/* Update the key, but keep the NAT flags. */
 		ovs_ct_update_key(skb, info, key, true, true);
@@ -902,9 +902,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		 *
 		 * NAT will be done only if the CT action has NAT, and only
 		 * once per packet (per zone), as guarded by the NAT bits in
-		 * the key->ct.state.
+		 * the key->ct_state.
 		 */
-		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
 		    (nf_ct_is_confirmed(ct) || info->commit) &&
 		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
 			return -EINVAL;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 9e92445dc092..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -76,14 +76,14 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
 				   struct sw_flow_key *key)
 {
-	key->ct.state = 0;
-	key->ct.zone = 0;
+	key->ct_state = 0;
+	key->ct_zone = 0;
 	key->ct.mark = 0;
 	memset(&key->ct.labels, 0, sizeof(key->ct.labels));
-	/* Clear 'ct.orig_proto' to mark the non-existence of original
+	/* Clear 'ct_orig_proto' to mark the non-existence of original
 	 * direction key fields.
 	 */
-	key->ct.orig_proto = 0;
+	key->ct_orig_proto = 0;
 }
 
 static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 76e05b25f030..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -85,6 +85,11 @@ struct sw_flow_key {
 		struct vlan_head cvlan;
 		__be16 type;		/* Ethernet frame type. */
 	} eth;
+	/* Filling a hole of two bytes. */
+	u8 ct_state;
+	u8 ct_orig_proto;		/* CT original direction tuple IP
+					 * protocol.
+					 */
 	union {
 		struct {
 			__be32 top_lse;	/* top label stack entry */
@@ -96,6 +101,7 @@ struct sw_flow_key {
 			u8     frag;	/* One of OVS_FRAG_TYPE_*. */
 		} ip;
 	};
+	u16 ct_zone;			/* Conntrack zone. */
 	struct {
 		__be16 src;		/* TCP/UDP/SCTP source port. */
 		__be16 dst;		/* TCP/UDP/SCTP destination port. */
@@ -138,16 +144,12 @@ struct sw_flow_key {
 		} ipv6;
 	};
 	struct {
-		/* Connection tracking fields. */
-		u8 state;
-		u8 orig_proto;		/* CT orig tuple IP protocol. */
-		u16 zone;
-		u32 mark;
+		/* Connection tracking fields not packed above. */
 		struct {
 			__be16 src;	/* CT orig tuple tp src port. */
 			__be16 dst;	/* CT orig tuple tp dst port. */
 		} orig_tp;
-
+		u32 mark;
 		struct ovs_key_ct_labels labels;
 	} ct;
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 989f38f120bb..6f5fa50f716d 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1072,14 +1072,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 			return -EINVAL;
 		}
 
-		SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
 	}
 	if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
 	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
 		u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
 
-		SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
 	}
 	if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1107,7 +1107,7 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 		SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
 		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
 		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
-		SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv4_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
 	}
 	if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
@@ -1123,7 +1123,7 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				   is_mask);
 		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
 		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
-		SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv6_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
 	}
 
@@ -1564,6 +1564,9 @@ int ovs_nla_get_flow_metadata(struct net *net,
 	memset(&match, 0, sizeof(match));
 	match.key = key;
 
+	key->ct_state = 0;
+	key->ct_zone = 0;
+	key->ct_orig_proto = 0;
 	memset(&key->ct, 0, sizeof(key->ct));
 	memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
 	memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
-- 
cgit v1.2.3


From 58e3bdd59742feef680861acec19126e40e4fa8d Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Thu, 9 Feb 2017 10:28:38 +0100
Subject: ipv4: fib: Only flush FIB aliases belonging to currently flushed
 table

In case the MAIN table is flushed and its trie is shared with the LOCAL
table, then we might be flushing FIB aliases belonging to the latter.
This can lead to FIB_ENTRY_DEL notifications sent with the wrong table
ID.

The above doesn't affect current listeners, as the table ID is ignored
during entry deletion, but this will change later in the patchset.

When flushing a particular table, skip any aliases belonging to a
different one.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Patrick McHardy <kaber@trash.net>
Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2919d1a10cfd..5ef4596d7afc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1963,7 +1963,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
 			struct fib_info *fi = fa->fa_info;
 
-			if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+			if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
+			    tb->tb_id != fa->tb_id) {
 				slen = fa->fa_slen;
 				continue;
 			}
-- 
cgit v1.2.3


From 42d5aa76ec8fc9602922cc590a437ecd6693523b Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Thu, 9 Feb 2017 10:28:39 +0100
Subject: ipv4: fib: Send deletion notification with actual FIB alias type

When a FIB alias is removed, a notification is sent using the type
passed from user space - can be RTN_UNSPEC - instead of the actual type
of the removed alias. This is problematic for listeners of the FIB
notification chain, as several FIB aliases can exist with matching
parameters, but the type.

Solve this by passing the actual type of the removed FIB alias.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5ef4596d7afc..b0bfb1cc791a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1653,8 +1653,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 		return -ESRCH;
 
 	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
-				 fa_to_delete->fa_info, tos, cfg->fc_type,
-				 tb->tb_id, 0);
+				 fa_to_delete->fa_info, tos,
+				 fa_to_delete->fa_type, tb->tb_id, 0);
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-- 
cgit v1.2.3


From 5b7d616dbccc2fd6ae959045e1a9ca17de5dfc2a Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Thu, 9 Feb 2017 10:28:40 +0100
Subject: ipv4: fib: Send notification before deleting FIB alias

When a FIB alias is replaced following NLM_F_REPLACE, the ENTRY_ADD
notification is sent after the reference on the previous FIB info was
dropped. This is problematic as potential listeners might need to access
it in their notification blocks.

Solve this by sending the notification prior to the deletion of the
replaced FIB alias. This is consistent with ENTRY_DEL notifications.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b0bfb1cc791a..1c4d42e46dbb 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1295,6 +1295,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->tb_id = tb->tb_id;
 			new_fa->fa_default = -1;
 
+			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+						 key, plen, fi,
+						 new_fa->fa_tos, cfg->fc_type,
+						 tb->tb_id, nlflags);
+			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
 			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
 			alias_free_mem_rcu(fa);
@@ -1303,13 +1310,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			if (state & FA_S_ACCESSED)
 				rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
-			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
-						 key, plen, fi,
-						 new_fa->fa_tos, cfg->fc_type,
-						 tb->tb_id, cfg->fc_nlflags);
-			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
-				tb->tb_id, &cfg->fc_nlinfo, nlflags);
-
 			goto succeeded;
 		}
 		/* Error if we find a perfect match which
-- 
cgit v1.2.3


From 2f3a5272e5c16c3c10fbba06928a513f9b1e2fcd Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Thu, 9 Feb 2017 10:28:41 +0100
Subject: ipv4: fib: Add events for FIB replace and append

The FIB notification chain currently uses the NLM_F_{REPLACE,APPEND}
flags to signal routes being replaced or appended.

Instead of using netlink flags for in-kernel notifications we can simply
introduce two new events in the FIB notification chain. This has the
added advantage of making the API cleaner, thereby making it clear that
these events should be supported by listeners of the notification chain.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h |  3 ++-
 net/ipv4/fib_trie.c  | 27 ++++++++++++++-------------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 45a184eaff2b..368bb4024b78 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -211,7 +211,6 @@ struct fib_entry_notifier_info {
 	u8 tos;
 	u8 type;
 	u32 tb_id;
-	u32 nlflags;
 };
 
 struct fib_nh_notifier_info {
@@ -220,6 +219,8 @@ struct fib_nh_notifier_info {
 };
 
 enum fib_event_type {
+	FIB_EVENT_ENTRY_REPLACE,
+	FIB_EVENT_ENTRY_APPEND,
 	FIB_EVENT_ENTRY_ADD,
 	FIB_EVENT_ENTRY_DEL,
 	FIB_EVENT_RULE_ADD,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1c4d42e46dbb..d8cea210af0e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -124,7 +124,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb,
 static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
 				   enum fib_event_type event_type, u32 dst,
 				   int dst_len, struct fib_info *fi,
-				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
+				   u8 tos, u8 type, u32 tb_id)
 {
 	struct fib_entry_notifier_info info = {
 		.dst = dst,
@@ -133,7 +133,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
 		.tos = tos,
 		.type = type,
 		.tb_id = tb_id,
-		.nlflags = nlflags,
 	};
 	return call_fib_notifier(nb, net, event_type, &info.info);
 }
@@ -197,7 +196,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 static int call_fib_entry_notifiers(struct net *net,
 				    enum fib_event_type event_type, u32 dst,
 				    int dst_len, struct fib_info *fi,
-				    u8 tos, u8 type, u32 tb_id, u32 nlflags)
+				    u8 tos, u8 type, u32 tb_id)
 {
 	struct fib_entry_notifier_info info = {
 		.dst = dst,
@@ -206,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
 		.tos = tos,
 		.type = type,
 		.tb_id = tb_id,
-		.nlflags = nlflags,
 	};
 	return call_fib_notifiers(net, event_type, &info.info);
 }
@@ -1198,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 int fib_table_insert(struct net *net, struct fib_table *tb,
 		     struct fib_config *cfg)
 {
+	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct fib_alias *fa, *new_fa;
 	struct key_vector *l, *tp;
@@ -1295,10 +1294,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->tb_id = tb->tb_id;
 			new_fa->fa_default = -1;
 
-			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
 						 key, plen, fi,
 						 new_fa->fa_tos, cfg->fc_type,
-						 tb->tb_id, nlflags);
+						 tb->tb_id);
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
 
@@ -1319,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 		if (fa_match)
 			goto out;
 
-		if (cfg->fc_nlflags & NLM_F_APPEND)
+		if (cfg->fc_nlflags & NLM_F_APPEND) {
+			event = FIB_EVENT_ENTRY_APPEND;
 			nlflags |= NLM_F_APPEND;
-		else
+		} else {
 			fa = fa_first;
+		}
 	}
 	err = -ENOENT;
 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1351,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 		tb->tb_num_default++;
 
 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
-	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
-				 cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
+	call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
+				 tb->tb_id);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
 		  &cfg->fc_nlinfo, nlflags);
 succeeded:
@@ -1654,7 +1655,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 
 	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
 				 fa_to_delete->fa_info, tos,
-				 fa_to_delete->fa_type, tb->tb_id, 0);
+				 fa_to_delete->fa_type, tb->tb_id);
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
@@ -1973,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 						 n->key,
 						 KEYLENGTH - fa->fa_slen,
 						 fi, fa->fa_tos, fa->fa_type,
-						 tb->tb_id, 0);
+						 tb->tb_id);
 			hlist_del_rcu(&fa->fa_list);
 			fib_release_info(fa->fa_info);
 			alias_free_mem_rcu(fa);
@@ -2013,7 +2014,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
 
 		call_fib_entry_notifier(nb, net, event_type, l->key,
 					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
-					fa->fa_type, fa->tb_id, 0);
+					fa->fa_type, fa->tb_id);
 	}
 }
 
-- 
cgit v1.2.3


From 79112c26f14c38ddbac3b2739469e373ef424fe6 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:38:55 +0100
Subject: sched: rename tcf_destroy to tcf_destroy_proto

This function destroys TC filter protocol, not TC filter. So name it
accordingly.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 2 +-
 net/sched/cls_api.c       | 8 ++++----
 net/sched/sch_api.c       | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e2f426f6d62f..453350650b9a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -405,7 +405,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				const struct Qdisc_ops *ops, u32 parentid);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
-bool tcf_destroy(struct tcf_proto *tp, bool force);
+bool tcf_proto_destroy(struct tcf_proto *tp, bool force);
 void tcf_destroy_chain(struct tcf_proto __rcu **fl);
 int skb_do_redirect(struct sk_buff *);
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1ecdf809b5fa..90536ebae02a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -323,7 +323,7 @@ replay:
 
 			tfilter_notify(net, skb, n, tp, fh,
 				       RTM_DELTFILTER, false);
-			tcf_destroy(tp, true);
+			tcf_proto_destroy(tp, true);
 			err = 0;
 			goto errout;
 		}
@@ -338,7 +338,7 @@ replay:
 			err = -EEXIST;
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
-					tcf_destroy(tp, true);
+					tcf_proto_destroy(tp, true);
 				goto errout;
 			}
 			break;
@@ -350,7 +350,7 @@ replay:
 				tfilter_notify(net, skb, n, tp,
 					       t->tcm_handle,
 					       RTM_DELTFILTER, false);
-				if (tcf_destroy(tp, false))
+				if (tcf_proto_destroy(tp, false))
 					RCU_INIT_POINTER(*back, next);
 			}
 			goto errout;
@@ -374,7 +374,7 @@ replay:
 		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
-			tcf_destroy(tp, true);
+			tcf_proto_destroy(tp, true);
 	}
 
 errout:
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ef53ede11590..f30b517f2282 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1900,7 +1900,7 @@ reset:
 }
 EXPORT_SYMBOL(tc_classify);
 
-bool tcf_destroy(struct tcf_proto *tp, bool force)
+bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
 {
 	if (tp->ops->destroy(tp, force)) {
 		module_put(tp->ops->owner);
@@ -1917,7 +1917,7 @@ void tcf_destroy_chain(struct tcf_proto __rcu **fl)
 
 	while ((tp = rtnl_dereference(*fl)) != NULL) {
 		RCU_INIT_POINTER(*fl, tp->next);
-		tcf_destroy(tp, true);
+		tcf_proto_destroy(tp, true);
 	}
 }
 EXPORT_SYMBOL(tcf_destroy_chain);
-- 
cgit v1.2.3


From cf1facda2f61bc3e9ffd985b6d624dec6ad3f279 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:38:56 +0100
Subject: sched: move tcf_proto_destroy and tcf_destroy_chain helpers into
 cls_api

Creation is done in this file, move destruction to be at the same place.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |  2 ++
 include/net/sch_generic.h |  2 --
 net/sched/cls_api.c       | 21 +++++++++++++++++++++
 net/sched/sch_api.c       | 22 ----------------------
 net/sched/sch_atm.c       |  1 +
 net/sched/sch_cbq.c       |  1 +
 net/sched/sch_choke.c     |  1 +
 net/sched/sch_dsmark.c    |  1 +
 net/sched/sch_fq_codel.c  |  1 +
 net/sched/sch_htb.c       |  1 +
 net/sched/sch_ingress.c   |  1 +
 net/sched/sch_multiq.c    |  2 +-
 net/sched/sch_prio.c      |  2 +-
 net/sched/sch_sfb.c       |  1 +
 net/sched/sch_sfq.c       |  1 +
 15 files changed, 34 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index dabb00af46a0..71b266cd63d4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -17,6 +17,8 @@ struct tcf_walker {
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
+void tcf_destroy_chain(struct tcf_proto __rcu **fl);
+
 static inline unsigned long
 __cls_set_class(unsigned long *clp, unsigned long cl)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 453350650b9a..aeec4086afb2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -405,8 +405,6 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				const struct Qdisc_ops *ops, u32 parentid);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
-bool tcf_proto_destroy(struct tcf_proto *tp, bool force);
-void tcf_destroy_chain(struct tcf_proto __rcu **fl);
 int skb_do_redirect(struct sk_buff *);
 
 static inline void skb_reset_tc(struct sk_buff *skb)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 90536ebae02a..4efa4df8322f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -127,6 +127,27 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 	return first;
 }
 
+static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
+{
+	if (tp->ops->destroy(tp, force)) {
+		module_put(tp->ops->owner);
+		kfree_rcu(tp, rcu);
+		return true;
+	}
+	return false;
+}
+
+void tcf_destroy_chain(struct tcf_proto __rcu **fl)
+{
+	struct tcf_proto *tp;
+
+	while ((tp = rtnl_dereference(*fl)) != NULL) {
+		RCU_INIT_POINTER(*fl, tp->next);
+		tcf_proto_destroy(tp, true);
+	}
+}
+EXPORT_SYMBOL(tcf_destroy_chain);
+
 /* Add/change/delete/get a filter node */
 
 static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f30b517f2282..adeabaec0d0b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1900,28 +1900,6 @@ reset:
 }
 EXPORT_SYMBOL(tc_classify);
 
-bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
-{
-	if (tp->ops->destroy(tp, force)) {
-		module_put(tp->ops->owner);
-		kfree_rcu(tp, rcu);
-		return true;
-	}
-
-	return false;
-}
-
-void tcf_destroy_chain(struct tcf_proto __rcu **fl)
-{
-	struct tcf_proto *tp;
-
-	while ((tp = rtnl_dereference(*fl)) != NULL) {
-		RCU_INIT_POINTER(*fl, tp->next);
-		tcf_proto_destroy(tp, true);
-	}
-}
-EXPORT_SYMBOL(tcf_destroy_chain);
-
 #ifdef CONFIG_PROC_FS
 static int psched_show(struct seq_file *seq, void *v)
 {
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 481e4f12aeb4..2209c2ddacbf 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -15,6 +15,7 @@
 #include <linux/file.h>		/* for fput */
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 /*
  * The ATM queuing discipline provides a framework for invoking classifiers
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f1207582cbf3..d6ca18dc04c3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -19,6 +19,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 
 /*	Class-Based Queueing (CBQ) algorithm.
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3b6d5bd69101..3b86a97bc67c 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -16,6 +16,7 @@
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
 #include <net/flow_dissector.h>
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1308bbf460f7..802ac7c2e5e8 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -13,6 +13,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/bitops.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <asm/byteorder.h>
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 2f50e4c72fb4..9f3a884d1590 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/codel.h>
 #include <net/codel_impl.h>
 #include <net/codel_qdisc.h>
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 760f39e7caee..4cd5fb134bc9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -40,6 +40,7 @@
 #include <net/netlink.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 /* HTB algorithm.
     Author: devik@cdi.cz
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 8fe6999b642a..3bab5f66c392 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,6 +16,7 @@
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
 {
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9ffbb025b37e..e7839a0d0eaa 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -25,7 +25,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
 
 struct multiq_sched_data {
 	u16 bands;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8f575899adfa..d4d7db267b6e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -20,7 +20,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
 
 struct prio_sched_data {
 	int bands;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 20a350bd1b1d..fe6963d21519 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -25,6 +25,7 @@
 #include <linux/jhash.h>
 #include <net/ip.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 
 /*
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7f195ed4d568..83d06e251b93 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/red.h>
 
 
-- 
cgit v1.2.3


From 33a48927c193d030c80ecaeb3e021b7ed85f9c78 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:38:57 +0100
Subject: sched: push TC filter protocol creation into a separate function

Make the long function tc_ctl_tfilter a little bit shorter and easier to
read. Also make the creation of filter proto symmetric to destruction.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 110 ++++++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4efa4df8322f..d378a0b156c1 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
@@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock);
 
 /* Find classifier type by string name */
 
-static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
 {
 	const struct tcf_proto_ops *t, *res = NULL;
 
 	if (kind) {
 		read_lock(&cls_mod_lock);
 		list_for_each_entry(t, &tcf_proto_base, head) {
-			if (nla_strcmp(kind, t->kind) == 0) {
+			if (strcmp(kind, t->kind) == 0) {
 				if (try_module_get(t->owner))
 					res = t;
 				break;
@@ -127,6 +128,56 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 	return first;
 }
 
+static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
+					  u32 prio, u32 parent, struct Qdisc *q)
+{
+	struct tcf_proto *tp;
+	int err;
+
+	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+	if (!tp)
+		return ERR_PTR(-ENOBUFS);
+
+	err = -ENOENT;
+	tp->ops = tcf_proto_lookup_ops(kind);
+	if (!tp->ops) {
+#ifdef CONFIG_MODULES
+		rtnl_unlock();
+		request_module("cls_%s", kind);
+		rtnl_lock();
+		tp->ops = tcf_proto_lookup_ops(kind);
+		/* We dropped the RTNL semaphore in order to perform
+		 * the module load. So, even if we succeeded in loading
+		 * the module we have to replay the request. We indicate
+		 * this using -EAGAIN.
+		 */
+		if (tp->ops) {
+			module_put(tp->ops->owner);
+			err = -EAGAIN;
+		} else {
+			err = -ENOENT;
+		}
+		goto errout;
+#endif
+	}
+	tp->classify = tp->ops->classify;
+	tp->protocol = protocol;
+	tp->prio = prio;
+	tp->classid = parent;
+	tp->q = q;
+
+	err = tp->ops->init(tp);
+	if (err) {
+		module_put(tp->ops->owner);
+		goto errout;
+	}
+	return tp;
+
+errout:
+	kfree(tp);
+	return ERR_PTR(err);
+}
+
 static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
 {
 	if (tp->ops->destroy(tp, force)) {
@@ -164,7 +215,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	struct tcf_proto __rcu **back;
 	struct tcf_proto __rcu **chain;
 	struct tcf_proto *tp;
-	const struct tcf_proto_ops *tp_ops;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	unsigned long fh;
@@ -279,58 +329,16 @@ replay:
 		    !(n->nlmsg_flags & NLM_F_CREATE))
 			goto errout;
 
+		if (!nprio)
+			nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
 
-		/* Create new proto tcf */
-
-		err = -ENOBUFS;
-		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
-		if (tp == NULL)
-			goto errout;
-		err = -ENOENT;
-		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
-		if (tp_ops == NULL) {
-#ifdef CONFIG_MODULES
-			struct nlattr *kind = tca[TCA_KIND];
-			char name[IFNAMSIZ];
-
-			if (kind != NULL &&
-			    nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
-				rtnl_unlock();
-				request_module("cls_%s", name);
-				rtnl_lock();
-				tp_ops = tcf_proto_lookup_ops(kind);
-				/* We dropped the RTNL semaphore in order to
-				 * perform the module load.  So, even if we
-				 * succeeded in loading the module we have to
-				 * replay the request.  We indicate this using
-				 * -EAGAIN.
-				 */
-				if (tp_ops != NULL) {
-					module_put(tp_ops->owner);
-					err = -EAGAIN;
-				}
-			}
-#endif
-			kfree(tp);
+		tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
+				      protocol, nprio, parent, q);
+		if (IS_ERR(tp)) {
+			err = PTR_ERR(tp);
 			goto errout;
 		}
-		tp->ops = tp_ops;
-		tp->protocol = protocol;
-		tp->prio = nprio ? :
-			       TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
-		tp->q = q;
-		tp->classify = tp_ops->classify;
-		tp->classid = parent;
-
-		err = tp_ops->init(tp);
-		if (err != 0) {
-			module_put(tp_ops->owner);
-			kfree(tp);
-			goto errout;
-		}
-
 		tp_created = 1;
-
 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
 		goto errout;
 
-- 
cgit v1.2.3


From 6bb16e7ae26095892e8c51de4142d8f344793340 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:38:58 +0100
Subject: sched: move err set right before goto errout in tc_ctl_tfilter

This makes the reader to know right away what is the error value.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d378a0b156c1..f44378c4859f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -293,9 +293,10 @@ replay:
 
 	/* And the last stroke */
 	chain = cops->tcf_chain(q, cl);
-	err = -EINVAL;
-	if (chain == NULL)
+	if (chain == NULL) {
+		err = -EINVAL;
 		goto errout;
+	}
 	if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
 		tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
 		tcf_destroy_chain(chain);
@@ -310,8 +311,10 @@ replay:
 		if (tp->prio >= prio) {
 			if (tp->prio == prio) {
 				if (!nprio ||
-				    (tp->protocol != protocol && protocol))
+				    (tp->protocol != protocol && protocol)) {
+					err = -EINVAL;
 					goto errout;
+				}
 			} else
 				tp = NULL;
 			break;
@@ -321,13 +324,16 @@ replay:
 	if (tp == NULL) {
 		/* Proto-tcf does not exist, create new one */
 
-		if (tca[TCA_KIND] == NULL || !protocol)
+		if (tca[TCA_KIND] == NULL || !protocol) {
+			err = -EINVAL;
 			goto errout;
+		}
 
-		err = -ENOENT;
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE))
+		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			err = -ENOENT;
 			goto errout;
+		}
 
 		if (!nprio)
 			nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
@@ -339,8 +345,10 @@ replay:
 			goto errout;
 		}
 		tp_created = 1;
-	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
+	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+		err = -EINVAL;
 		goto errout;
+	}
 
 	fh = tp->ops->get(tp, t->tcm_handle);
 
@@ -357,17 +365,18 @@ replay:
 			goto errout;
 		}
 
-		err = -ENOENT;
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE))
+		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			err = -ENOENT;
 			goto errout;
+		}
 	} else {
 		switch (n->nlmsg_type) {
 		case RTM_NEWTFILTER:
-			err = -EEXIST;
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
 					tcf_proto_destroy(tp, true);
+				err = -EEXIST;
 				goto errout;
 			}
 			break;
-- 
cgit v1.2.3


From 7215032ced14f9943c8b72b61f4ac52902002158 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:38:59 +0100
Subject: sched: add missing curly braces in else branch in tc_ctl_tfilter

Curly braces need to be there, for stylistic reasons.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f44378c4859f..48864ad2322d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -315,8 +315,9 @@ replay:
 					err = -EINVAL;
 					goto errout;
 				}
-			} else
+			} else {
 				tp = NULL;
+			}
 			break;
 		}
 	}
-- 
cgit v1.2.3


From 40c81b25b16cd871afe70630c131bd2544848d1f Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 14:39:00 +0100
Subject: sched: check negative err value to safe one level of indent

As it is more common, check err for !0. That allows to safe one level of
indentation and makes the code easier to read. Also, make 'next' variable
global in function as it is used twice.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 48864ad2322d..abe1fe13ae54 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -214,6 +214,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	struct Qdisc  *q;
 	struct tcf_proto __rcu **back;
 	struct tcf_proto __rcu **chain;
+	struct tcf_proto *next;
 	struct tcf_proto *tp;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
@@ -355,10 +356,8 @@ replay:
 
 	if (fh == 0) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
-			struct tcf_proto *next = rtnl_dereference(tp->next);
-
+			next = rtnl_dereference(tp->next);
 			RCU_INIT_POINTER(*back, next);
-
 			tfilter_notify(net, skb, n, tp, fh,
 				       RTM_DELTFILTER, false);
 			tcf_proto_destroy(tp, true);
@@ -383,16 +382,13 @@ replay:
 			break;
 		case RTM_DELTFILTER:
 			err = tp->ops->delete(tp, fh);
-			if (err == 0) {
-				struct tcf_proto *next = rtnl_dereference(tp->next);
-
-				tfilter_notify(net, skb, n, tp,
-					       t->tcm_handle,
-					       RTM_DELTFILTER, false);
-				if (tcf_proto_destroy(tp, false))
-					RCU_INIT_POINTER(*back, next);
-			}
-			goto errout;
+			if (err)
+				goto errout;
+			next = rtnl_dereference(tp->next);
+			tfilter_notify(net, skb, n, tp, t->tcm_handle,
+				       RTM_DELTFILTER, false);
+			if (tcf_proto_destroy(tp, false))
+				RCU_INIT_POINTER(*back, next);
 		case RTM_GETTFILTER:
 			err = tfilter_notify(net, skb, n, tp, fh,
 					     RTM_NEWTFILTER, true);
-- 
cgit v1.2.3


From 147c1e9b902c25c868024260d24bb0b1dac1433d Mon Sep 17 00:00:00 2001
From: Nogah Frankel
Date: Thu, 9 Feb 2017 14:54:40 +0100
Subject: switchdev: bridge: Offload multicast disabled

Offload multicast disabled flag, for more accurate mc flood behavior:
When it is on, the mdb should be ignored.
When it is off, unregistered mc packets should be flooded to mc router
ports.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  2 ++
 net/bridge/br_multicast.c | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index eba80c4fc56f..2971c2a2cdf2 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -48,6 +48,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
 	SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
 	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
+	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
 };
 
 struct switchdev_attr {
@@ -62,6 +63,7 @@ struct switchdev_attr {
 		unsigned long brport_flags;		/* PORT_BRIDGE_FLAGS */
 		clock_t ageing_time;			/* BRIDGE_AGEING_TIME */
 		bool vlan_filtering;			/* BRIDGE_VLAN_FILTERING */
+		bool mc_disabled;			/* MC_DISABLED */
 	} u;
 };
 
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 1de3438e36bf..8c0e896936ff 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -27,6 +27,7 @@
 #include <linux/inetdevice.h>
 #include <linux/mroute.h>
 #include <net/ip.h>
+#include <net/switchdev.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
 #include <net/mld.h>
@@ -1007,6 +1008,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 }
 #endif
 
+static void br_mc_disabled_update(struct net_device *dev, bool value)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = dev,
+		.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+		.flags = SWITCHDEV_F_DEFER,
+		.u.mc_disabled = value,
+	};
+
+	switchdev_port_attr_set(dev, &attr);
+}
+
 int br_multicast_add_port(struct net_bridge_port *port)
 {
 	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -1019,6 +1032,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
 	setup_timer(&port->ip6_own_query.timer,
 		    br_ip6_multicast_port_query_expired, (unsigned long)port);
 #endif
+	br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+
 	port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
 	if (!port->mcast_stats)
 		return -ENOMEM;
@@ -2121,6 +2136,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 	if (br->multicast_disabled == !val)
 		goto unlock;
 
+	br_mc_disabled_update(br->dev, !val);
 	br->multicast_disabled = !val;
 	if (br->multicast_disabled)
 		goto unlock;
-- 
cgit v1.2.3


From f12e7d95d12fa4dbe655bd038a6d38526a879deb Mon Sep 17 00:00:00 2001
From: Nogah Frankel
Date: Thu, 9 Feb 2017 14:54:41 +0100
Subject: bridge: mcast: Merge the mc router ports deletions to one function

There are three places where a port gets deleted from the mc router port
list. This patch join the actual deletion to one function.
It will be helpful for later patch that will offload changes in the mc
router ports list.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8c0e896936ff..2add6d417aa4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -47,6 +47,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 __u16 vid,
 					 const unsigned char *src);
 
+static void __del_port_router(struct net_bridge_port *p);
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
@@ -850,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data)
 	spin_lock(&br->multicast_lock);
 	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
 	    port->multicast_router == MDB_RTR_TYPE_PERM ||
-	    timer_pending(&port->multicast_router_timer) ||
-	    hlist_unhashed(&port->rlist))
+	    timer_pending(&port->multicast_router_timer))
 		goto out;
 
-	hlist_del_init_rcu(&port->rlist);
-	br_rtr_notify(br->dev, port, RTM_DELMDB);
-	/* Don't allow timer refresh if the router expired */
-	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
-		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-
+	__del_port_router(port);
 out:
 	spin_unlock(&br->multicast_lock);
 }
@@ -1101,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 		if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
 			br_multicast_del_pg(br, pg);
 
-	if (!hlist_unhashed(&port->rlist)) {
-		hlist_del_init_rcu(&port->rlist);
-		br_rtr_notify(br->dev, port, RTM_DELMDB);
-		/* Don't allow timer refresh if disabling */
-		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
-			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-	}
+	__del_port_router(port);
+
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -2059,6 +2049,10 @@ static void __del_port_router(struct net_bridge_port *p)
 		return;
 	hlist_del_init_rcu(&p->rlist);
 	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+
+	/* don't allow timer refresh */
+	if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 }
 
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
-- 
cgit v1.2.3


From 6d5496483f5eb7b4da2e83c7b2149a21ad412d96 Mon Sep 17 00:00:00 2001
From: Nogah Frankel
Date: Thu, 9 Feb 2017 14:54:42 +0100
Subject: switchdev: bridge: Offload mc router ports

Offload the mc router ports list, whenever it is being changed.
It is done because in some cases mc packets needs to be flooded to all
the ports in this list.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |  2 ++
 net/bridge/br_multicast.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 2971c2a2cdf2..929d6af321cd 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -46,6 +46,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
 	SWITCHDEV_ATTR_ID_PORT_STP_STATE,
 	SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
+	SWITCHDEV_ATTR_ID_PORT_MROUTER,
 	SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
 	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
 	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
@@ -61,6 +62,7 @@ struct switchdev_attr {
 		struct netdev_phys_item_id ppid;	/* PORT_PARENT_ID */
 		u8 stp_state;				/* PORT_STP_STATE */
 		unsigned long brport_flags;		/* PORT_BRIDGE_FLAGS */
+		bool mrouter;				/* PORT_MROUTER */
 		clock_t ageing_time;			/* BRIDGE_AGEING_TIME */
 		bool vlan_filtering;			/* BRIDGE_VLAN_FILTERING */
 		bool mc_disabled;			/* MC_DISABLED */
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2add6d417aa4..b760f2620abf 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1317,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br,
 	mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
 }
 
+static void br_port_mc_router_state_change(struct net_bridge_port *p,
+					   bool is_mc_router)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = p->dev,
+		.id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
+		.flags = SWITCHDEV_F_DEFER,
+		.u.mrouter = is_mc_router,
+	};
+
+	switchdev_port_attr_set(p->dev, &attr);
+}
+
 /*
  * Add port to router_list
  *  list is maintained ordered by pointer value
@@ -1342,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br,
 	else
 		hlist_add_head_rcu(&port->rlist, &br->router_list);
 	br_rtr_notify(br->dev, port, RTM_NEWMDB);
+	br_port_mc_router_state_change(port, true);
 }
 
 static void br_multicast_mark_router(struct net_bridge *br,
@@ -2049,6 +2063,7 @@ static void __del_port_router(struct net_bridge_port *p)
 		return;
 	hlist_del_init_rcu(&p->rlist);
 	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+	br_port_mc_router_state_change(p, false);
 
 	/* don't allow timer refresh */
 	if (p->multicast_router == MDB_RTR_TYPE_TEMP)
-- 
cgit v1.2.3


From 71d0ed7079dffbc5cd0941d77d9b84e04109c9bb Mon Sep 17 00:00:00 2001
From: Amir Vadai
Date: Tue, 7 Feb 2017 09:56:07 +0200
Subject: net/act_pedit: Support using offset relative to the conventional
 network headers

Extend pedit to enable the user setting offset relative to network
headers. This change would enable to work with more complex header
schemes (vs the simple IPv4 case) where setting a fixed offset relative
to the network header is not enough.

After this patch, the action has information about the exact header type
and field inside this header. This information could be used later on
for hardware offloading of pedit.

Backward compatibility was being kept:
1. Old kernel <-> new userspace
2. New kernel <-> old userspace
3. add rule using new userspace <-> dump using old userspace
4. add rule using old userspace <-> dump using new userspace

When using the extended api, new netlink attributes are being used. This
way, operation will fail in (1) and (3) - and no malformed rule be added
or dumped. Of course, new user space that doesn't need the new
functionality can use the old netlink attributes and operation will
succeed.
Since action can support both api's, (2) should work, and it is easy to
write the new user space to have (4) work.

The action is having a strict check that only header types and commands
it can handle are accepted. This way future additions will be much
easier.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent ffff: \
  flower \
    ip_proto tcp \
    dst_port 80 \
  action pedit munge tcp dport set 8080 pipe \
  action mirred egress redirect dev veth0

Will forward tcp port whose original dest port is 80, while modifying
the destination port to 8080.

Signed-off-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_pedit.h        |   5 +
 include/uapi/linux/tc_act/tc_pedit.h |  23 ++++
 net/sched/act_pedit.c                | 196 ++++++++++++++++++++++++++++++++---
 3 files changed, 208 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 29e38d6823df..e076f22035a5 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -3,11 +3,16 @@
 
 #include <net/act_api.h>
 
+struct tcf_pedit_key_ex {
+	enum pedit_header_type htype;
+};
+
 struct tcf_pedit {
 	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
 	struct tc_pedit_key	*tcfp_keys;
+	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
 #define to_pedit(a) ((struct tcf_pedit *)a)
 
diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..22f19eeda997 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -11,10 +11,33 @@ enum {
 	TCA_PEDIT_TM,
 	TCA_PEDIT_PARMS,
 	TCA_PEDIT_PAD,
+	TCA_PEDIT_PARMS_EX,
+	TCA_PEDIT_KEYS_EX,
+	TCA_PEDIT_KEY_EX,
 	__TCA_PEDIT_MAX
 };
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)
                                                                                 
+enum {
+	TCA_PEDIT_KEY_EX_HTYPE = 1,
+	__TCA_PEDIT_KEY_EX_MAX
+};
+#define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
+
+ /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWROK is a special case for legacy users. It
+  * means no specific header type - offset is relative to the network layer
+  */
+enum pedit_header_type {
+	TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
+	TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
+	TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
+	TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
+	TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
+	TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
+	__PEDIT_HDR_TYPE_MAX,
+};
+#define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
+
 struct tc_pedit_key {
 	__u32           mask;  /* AND */
 	__u32           val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..fdd012bd3602 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,6 +22,7 @@
 #include <net/pkt_sched.h>
 #include <linux/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_pedit.h>
+#include <uapi/linux/tc_act/tc_pedit.h>
 
 #define PEDIT_TAB_MASK	15
 
@@ -30,18 +31,112 @@ static struct tc_action_ops act_pedit_ops;
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
+	[TCA_PEDIT_KEYS_EX]   = { .type = NLA_NESTED },
 };
 
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
+	[TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+							u8 n)
+{
+	struct tcf_pedit_key_ex *keys_ex;
+	struct tcf_pedit_key_ex *k;
+	const struct nlattr *ka;
+	int err = -EINVAL;
+	int rem;
+
+	if (!nla || !n)
+		return NULL;
+
+	keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+	if (!keys_ex)
+		return ERR_PTR(-ENOMEM);
+
+	k = keys_ex;
+
+	nla_for_each_nested(ka, nla, rem) {
+		struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
+
+		if (!n) {
+			err = -EINVAL;
+			goto err_out;
+		}
+		n--;
+
+		if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
+				       pedit_key_ex_policy);
+		if (err)
+			goto err_out;
+
+		if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+
+		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		k++;
+	}
+
+	if (n)
+		goto err_out;
+
+	return keys_ex;
+
+err_out:
+	kfree(keys_ex);
+	return ERR_PTR(err);
+}
+
+static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
+				 struct tcf_pedit_key_ex *keys_ex, int n)
+{
+	struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+
+	for (; n > 0; n--) {
+		struct nlattr *key_start;
+
+		key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+
+		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) {
+			nlmsg_trim(skb, keys_start);
+			return -EINVAL;
+		}
+
+		nla_nest_end(skb, key_start);
+
+		keys_ex++;
+	}
+
+	nla_nest_end(skb, keys_start);
+
+	return 0;
+}
+
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
 			  int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
+	struct nlattr *pattr;
 	struct tc_pedit *parm;
 	int ret = 0, err;
 	struct tcf_pedit *p;
 	struct tc_pedit_key *keys = NULL;
+	struct tcf_pedit_key_ex *keys_ex;
 	int ksize;
 
 	if (nla == NULL)
@@ -51,13 +146,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_PEDIT_PARMS] == NULL)
+	pattr = tb[TCA_PEDIT_PARMS];
+	if (!pattr)
+		pattr = tb[TCA_PEDIT_PARMS_EX];
+	if (!pattr)
 		return -EINVAL;
-	parm = nla_data(tb[TCA_PEDIT_PARMS]);
+
+	parm = nla_data(pattr);
 	ksize = parm->nkeys * sizeof(struct tc_pedit_key);
-	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
+	if (nla_len(pattr) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
+	keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
+	if (IS_ERR(keys_ex))
+		return PTR_ERR(keys_ex);
+
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
@@ -69,6 +172,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		keys = kmalloc(ksize, GFP_KERNEL);
 		if (keys == NULL) {
 			tcf_hash_cleanup(*a, est);
+			kfree(keys_ex);
 			return -ENOMEM;
 		}
 		ret = ACT_P_CREATED;
@@ -81,8 +185,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p = to_pedit(*a);
 		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
-			if (keys == NULL)
+			if (!keys) {
+				kfree(keys_ex);
 				return -ENOMEM;
+			}
 		}
 	}
 
@@ -95,6 +201,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p->tcfp_nkeys = parm->nkeys;
 	}
 	memcpy(p->tcfp_keys, parm->keys, ksize);
+
+	kfree(p->tcfp_keys_ex);
+	p->tcfp_keys_ex = keys_ex;
+
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
 		tcf_hash_insert(tn, *a);
@@ -106,6 +216,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
 	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
 	kfree(keys);
+	kfree(p->tcfp_keys_ex);
 }
 
 static bool offset_valid(struct sk_buff *skb, int offset)
@@ -119,38 +230,84 @@ static bool offset_valid(struct sk_buff *skb, int offset)
 	return true;
 }
 
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+				enum pedit_header_type htype, int *hoffset)
+{
+	int ret = -EINVAL;
+
+	switch (htype) {
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+		if (skb_mac_header_was_set(skb)) {
+			*hoffset = skb_mac_offset(skb);
+			ret = 0;
+		}
+		break;
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+		*hoffset = skb_network_offset(skb);
+		ret = 0;
+		break;
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+		if (skb_transport_header_was_set(skb)) {
+			*hoffset = skb_transport_offset(skb);
+			ret = 0;
+		}
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	};
+
+	return ret;
+}
+
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		     struct tcf_result *res)
 {
 	struct tcf_pedit *p = to_pedit(a);
 	int i;
-	unsigned int off;
 
 	if (skb_unclone(skb, GFP_ATOMIC))
 		return p->tcf_action;
 
-	off = skb_network_offset(skb);
-
 	spin_lock(&p->tcf_lock);
 
 	tcf_lastuse_update(&p->tcf_tm);
 
 	if (p->tcfp_nkeys > 0) {
 		struct tc_pedit_key *tkey = p->tcfp_keys;
+		struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
+		enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
 			u32 *ptr, _data;
 			int offset = tkey->off;
+			int hoffset;
+			int rc;
+
+			if (tkey_ex) {
+				htype = tkey_ex->htype;
+				tkey_ex++;
+			}
+
+			rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+			if (rc) {
+				pr_info("tc filter pedit bad header type specified (0x%x)\n",
+					htype);
+				goto bad;
+			}
 
 			if (tkey->offmask) {
 				char *d, _d;
 
-				if (!offset_valid(skb, off + tkey->at)) {
+				if (!offset_valid(skb, hoffset + tkey->at)) {
 					pr_info("tc filter pedit 'at' offset %d out of bounds\n",
-						off + tkey->at);
+						hoffset + tkey->at);
 					goto bad;
 				}
-				d = skb_header_pointer(skb, off + tkey->at, 1,
+				d = skb_header_pointer(skb, hoffset + tkey->at, 1,
 						       &_d);
 				if (!d)
 					goto bad;
@@ -163,19 +320,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 				goto bad;
 			}
 
-			if (!offset_valid(skb, off + offset)) {
+			if (!offset_valid(skb, hoffset + offset)) {
 				pr_info("tc filter pedit offset %d out of bounds\n",
-					offset);
+					hoffset + offset);
 				goto bad;
 			}
 
-			ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+			ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
 			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
 			if (ptr == &_data)
-				skb_store_bits(skb, off + offset, ptr, 4);
+				skb_store_bits(skb, hoffset + offset, ptr, 4);
 		}
 
 		goto done;
@@ -215,8 +372,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	opt->refcnt = p->tcf_refcnt - ref;
 	opt->bindcnt = p->tcf_bindcnt - bind;
 
-	if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
-		goto nla_put_failure;
+	if (p->tcfp_keys_ex) {
+		tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
+
+		if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
+			goto nla_put_failure;
+	} else {
+		if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+			goto nla_put_failure;
+	}
 
 	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
-- 
cgit v1.2.3


From 853a14ba4682f820266469979c9297debc05f60c Mon Sep 17 00:00:00 2001
From: Amir Vadai
Date: Tue, 7 Feb 2017 09:56:08 +0200
Subject: net/act_pedit: Introduce 'add' operation

This command could be useful to inc/dec fields.

For example, to forward any TCP packet and decrease its TTL:
$ tc filter add dev enp0s9 protocol ip parent ffff: \
    flower ip_proto tcp \
    action pedit munge ip ttl add 0xff pipe \
    action mirred egress redirect dev veth0

In the example above, adding 0xff to this u8 field is actually
decreasing it by one, since the operation is masked.

Signed-off-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_pedit.h        |  1 +
 include/uapi/linux/tc_act/tc_pedit.h |  8 ++++++++
 net/sched/act_pedit.c                | 30 ++++++++++++++++++++++++++----
 3 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index e076f22035a5..dfbd6ee0bc7c 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -5,6 +5,7 @@
 
 struct tcf_pedit_key_ex {
 	enum pedit_header_type htype;
+	enum pedit_cmd cmd;
 };
 
 struct tcf_pedit {
diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 22f19eeda997..143d2b31a316 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -20,6 +20,7 @@ enum {
                                                                                 
 enum {
 	TCA_PEDIT_KEY_EX_HTYPE = 1,
+	TCA_PEDIT_KEY_EX_CMD = 2,
 	__TCA_PEDIT_KEY_EX_MAX
 };
 #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
@@ -38,6 +39,13 @@ enum pedit_header_type {
 };
 #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
 
+enum pedit_cmd {
+	TCA_PEDIT_KEY_EX_CMD_SET = 0,
+	TCA_PEDIT_KEY_EX_CMD_ADD = 1,
+	__PEDIT_CMD_MAX,
+};
+#define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1)
+
 struct tc_pedit_key {
 	__u32           mask;  /* AND */
 	__u32           val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fdd012bd3602..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -36,6 +36,7 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 
 static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
 	[TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+	[TCA_PEDIT_KEY_EX_CMD]	  = { .type = NLA_U16 },
 };
 
 static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
@@ -75,14 +76,17 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
 		if (err)
 			goto err_out;
 
-		if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) {
+		if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+		    !tb[TCA_PEDIT_KEY_EX_CMD]) {
 			err = -EINVAL;
 			goto err_out;
 		}
 
 		k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+		k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
 
-		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) {
+		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+		    k->cmd > TCA_PEDIT_CMD_MAX) {
 			err = -EINVAL;
 			goto err_out;
 		}
@@ -110,7 +114,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
 
 		key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
 
-		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) {
+		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+		    nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
 			nlmsg_trim(skb, keys_start);
 			return -EINVAL;
 		}
@@ -280,15 +285,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		struct tc_pedit_key *tkey = p->tcfp_keys;
 		struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
 		enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+		enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
 			u32 *ptr, _data;
 			int offset = tkey->off;
 			int hoffset;
+			u32 val;
 			int rc;
 
 			if (tkey_ex) {
 				htype = tkey_ex->htype;
+				cmd = tkey_ex->cmd;
+
 				tkey_ex++;
 			}
 
@@ -330,7 +339,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
-			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
+			switch (cmd) {
+			case TCA_PEDIT_KEY_EX_CMD_SET:
+				val = tkey->val;
+				break;
+			case TCA_PEDIT_KEY_EX_CMD_ADD:
+				val = (*ptr + tkey->val) & ~tkey->mask;
+				break;
+			default:
+				pr_info("tc filter pedit bad command (%d)\n",
+					cmd);
+				goto bad;
+			}
+
+			*ptr = ((*ptr & tkey->mask) ^ val);
 			if (ptr == &_data)
 				skb_store_bits(skb, hoffset + offset, ptr, 4);
 		}
-- 
cgit v1.2.3


From 722c9a0cebb88ac1a982285f15d5fd44f4140c66 Mon Sep 17 00:00:00 2001
From: tcharding
Date: Thu, 9 Feb 2017 17:56:04 +1100
Subject: net: Fix checkpatch WARNING: please, no space before tabs

This patch fixes multiple occurrences of space before tabs warnings.
More lines of code were moved than required to keep kernel-doc
comments uniform.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 142 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 71 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0921609dfa81..b7c795017299 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
 /*
- * 	NET3	Protocol independent device support routines.
+ *      NET3    Protocol independent device support routines.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  *	Derived from the non IP parts of dev.c 1.0.19
- * 		Authors:	Ross Biro
+ *              Authors:	Ross Biro
  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  *
@@ -21,9 +21,9 @@
  *
  *	Changes:
  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
- *              			to 2 if register_netdev gets called
- *              			before net_dev_init & also removed a
- *              			few lines of code in the process.
+ *                                      to 2 if register_netdev gets called
+ *                                      before net_dev_init & also removed a
+ *                                      few lines of code in the process.
  *		Alan Cox	:	device private ioctl copies fields back.
  *		Alan Cox	:	Transmit queue code does relevant
  *					stunts to keep the queue safe.
@@ -36,7 +36,7 @@
  *		Alan Cox	:	100 backlog just doesn't cut it when
  *					you start doing multicast video 8)
  *		Alan Cox	:	Rewrote net_bh and list manager.
- *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
+ *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  *		Alan Cox	:	Took out transmit every packet pass
  *					Saved a few bytes in the ioctl handler
  *		Alan Cox	:	Network driver sets packet type before
@@ -46,7 +46,7 @@
  *		Richard Kooijman:	Timestamp fixes.
  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  *		Alan Cox	:	Device lock protection.
- *		Alan Cox	: 	Fixed nasty side effect of device close
+ *              Alan Cox        :       Fixed nasty side effect of device close
  *					changes.
  *		Rudi Cilibrasi	:	Pass the right thing to
  *					set_mac_address()
@@ -67,8 +67,8 @@
  *	Paul Rusty Russell	:	SIOCSIFNAME
  *              Pekka Riikonen  :	Netdev boot-time settings code
  *              Andrew Morton   :       Make unregister_netdevice wait
- *              			indefinitely on dev->refcnt
- * 		J Hadi Salim	:	- Backlog queue sampling
+ *                                      indefinitely on dev->refcnt
+ *              J Hadi Salim    :       - Backlog queue sampling
  *				        - netif_rx() feedback
  */
 
@@ -574,13 +574,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
 }
 
 /**
- *	netdev_boot_setup_check	- check boot time settings
- *	@dev: the netdevice
+ * netdev_boot_setup_check	- check boot time settings
+ * @dev: the netdevice
  *
- * 	Check boot time settings for the device.
- *	The found settings are set for the device to be used
- *	later in the device probing.
- *	Returns 0 if no settings found, 1 if they are.
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
  */
 int netdev_boot_setup_check(struct net_device *dev)
 {
@@ -590,10 +590,10 @@ int netdev_boot_setup_check(struct net_device *dev)
 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 		    !strcmp(dev->name, s[i].name)) {
-			dev->irq 	= s[i].map.irq;
-			dev->base_addr 	= s[i].map.base_addr;
-			dev->mem_start 	= s[i].map.mem_start;
-			dev->mem_end 	= s[i].map.mem_end;
+			dev->irq = s[i].map.irq;
+			dev->base_addr = s[i].map.base_addr;
+			dev->mem_start = s[i].map.mem_start;
+			dev->mem_end = s[i].map.mem_end;
 			return 1;
 		}
 	}
@@ -603,14 +603,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
 
 
 /**
- *	netdev_boot_base	- get address from boot time settings
- *	@prefix: prefix for network device
- *	@unit: id for network device
+ * netdev_boot_base	- get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
  *
- * 	Check boot time settings for the base address of device.
- *	The found settings are set for the device to be used
- *	later in the device probing.
- *	Returns 0 if no settings found.
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
  */
 unsigned long netdev_boot_base(const char *prefix, int unit)
 {
@@ -737,15 +737,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
 EXPORT_SYMBOL(__dev_get_by_name);
 
 /**
- *	dev_get_by_name_rcu	- find a device by its name
- *	@net: the applicable net namespace
- *	@name: name to find
+ * dev_get_by_name_rcu	- find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
  *
- *	Find an interface by name.
- *	If the name is found a pointer to the device is returned.
- * 	If the name is not found then %NULL is returned.
- *	The reference counters are not incremented so the caller must be
- *	careful with locks. The caller must hold RCU lock.
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
  */
 
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1289,8 +1289,8 @@ void netdev_state_change(struct net_device *dev)
 EXPORT_SYMBOL(netdev_state_change);
 
 /**
- * 	netdev_notify_peers - notify network peers about existence of @dev
- * 	@dev: network device
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
  *
  * Generate traffic such that interested network peers are aware of
  * @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1518,17 +1518,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 static int dev_boot_phase = 1;
 
 /**
- *	register_netdevice_notifier - register a network notifier block
- *	@nb: notifier
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
  *
- *	Register a notifier to be called when network device events occur.
- *	The notifier passed is linked into the kernel structures and must
- *	not be reused until it has been unregistered. A negative errno code
- *	is returned on a failure.
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
  *
- * 	When registered all registration and up events are replayed
- *	to the new notifier to allow device to have a race free
- *	view of the network device list.
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
  */
 
 int register_netdevice_notifier(struct notifier_block *nb)
@@ -1585,17 +1585,17 @@ outroll:
 EXPORT_SYMBOL(register_netdevice_notifier);
 
 /**
- *	unregister_netdevice_notifier - unregister a network notifier block
- *	@nb: notifier
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
  *
- *	Unregister a notifier previously registered by
- *	register_netdevice_notifier(). The notifier is unlinked into the
- *	kernel structures and may then be reused. A negative errno code
- *	is returned on a failure.
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
  *
- * 	After unregistering unregister and down device events are synthesized
- *	for all devices on the device list to the removed notifier to remove
- *	the need for special case cleanup code.
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
  */
 
 int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -7544,17 +7544,17 @@ void netdev_freemem(struct net_device *dev)
 }
 
 /**
- *	alloc_netdev_mqs - allocate network device
- *	@sizeof_priv:		size of private data to allocate space for
- *	@name:			device name format string
- *	@name_assign_type: 	origin of device name
- *	@setup:			callback to initialize device
- *	@txqs:			the number of TX subqueues to allocate
- *	@rxqs:			the number of RX subqueues to allocate
- *
- *	Allocates a struct net_device with private data area for driver use
- *	and performs basic initialization.  Also allocates subqueue structs
- *	for each queue on the device.
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization.  Also allocates subqueue structs
+ * for each queue on the device.
  */
 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned char name_assign_type,
@@ -7666,13 +7666,13 @@ free_dev:
 EXPORT_SYMBOL(alloc_netdev_mqs);
 
 /**
- *	free_netdev - free network device
- *	@dev: device
+ * free_netdev - free network device
+ * @dev: device
  *
- *	This function does the last stage of destroying an allocated device
- * 	interface. The reference to the device object is released.
- *	If this is the last reference then it will be freed.
- *	Must be called in process context.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed.Must be called in process
+ * context.
  */
 void free_netdev(struct net_device *dev)
 {
-- 
cgit v1.2.3


From 643aa9cba0b688ffde28ac39aebec6f56fc2a8af Mon Sep 17 00:00:00 2001
From: tcharding
Date: Thu, 9 Feb 2017 17:56:05 +1100
Subject: net: Fix checkpatch whitespace errors

This patch fixes two trivial whitespace errors. Brace should be
on the previous line and trailing statements should be on next line.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index b7c795017299..60a0be743cdb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -192,7 +192,8 @@ static seqcount_t devnet_rename_seq;
 
 static inline void dev_base_seq_inc(struct net *net)
 {
-	while (++net->dev_base_seq == 0);
+	while (++net->dev_base_seq == 0)
+		;
 }
 
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -274,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  * according to dev->type
  */
-static const unsigned short netdev_lock_type[] =
-	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+static const unsigned short netdev_lock_type[] = {
+	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -291,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 
-static const char *const netdev_lock_name[] =
-	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
-	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
-	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
-	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
-	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
-	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
-	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
-	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
-	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
-	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
-	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
-	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
-	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
-	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
-	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+static const char *const netdev_lock_name[] = {
+	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 
 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
-- 
cgit v1.2.3


From eb13da1a103a808c05267816fa4d30d603bfda5e Mon Sep 17 00:00:00 2001
From: tcharding
Date: Thu, 9 Feb 2017 17:56:06 +1100
Subject: net: Fix checkpatch block comments warnings

Fix multiple occurrences of checkpatch warning. WARNING: Block
comments use * on subsequent lines. Also make comment blocks
more uniform.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 65 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 60a0be743cdb..e583820c21be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -353,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 #endif
 
 /*******************************************************************************
+ *
+ *		Protocol management and registration routines
+ *
+ *******************************************************************************/
 
-		Protocol management and registration routines
-
-*******************************************************************************/
 
 /*
  *	Add a protocol ID to the list. Now that the input handler is
@@ -539,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
 EXPORT_SYMBOL(dev_remove_offload);
 
 /******************************************************************************
-
-		      Device Boot-time Settings Routines
-
-*******************************************************************************/
+ *
+ *		      Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
 
 /* Boot time configuration table */
 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -664,10 +665,10 @@ int __init netdev_boot_setup(char *str)
 __setup("netdev=", netdev_boot_setup);
 
 /*******************************************************************************
-
-			    Device Interface Subroutines
-
-*******************************************************************************/
+ *
+ *			    Device Interface Subroutines
+ *
+ *******************************************************************************/
 
 /**
  *	dev_get_iflink	- get 'iflink' value of a interface
@@ -3326,16 +3327,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	}
 
 	/* The device has no queue. Common case for software devices:
-	   loopback, all the sorts of tunnels...
+	 * loopback, all the sorts of tunnels...
 
-	   Really, it is unlikely that netif_tx_lock protection is necessary
-	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
-	   counters.)
-	   However, it is possible, that they rely on protection
-	   made by us here.
+	 * Really, it is unlikely that netif_tx_lock protection is necessary
+	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
+	 * counters.)
+	 * However, it is possible, that they rely on protection
+	 * made by us here.
 
-	   Check this and shot the lock. It is not prone from deadlocks.
-	   Either shot noqueue qdisc, it is even simpler 8)
+	 * Check this and shot the lock. It is not prone from deadlocks.
+	 *Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3397,9 +3398,9 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
 
-/*=======================================================================
-			Receiver routines
-  =======================================================================*/
+/*************************************************************************
+ *			Receiver routines
+ *************************************************************************/
 
 int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
@@ -6359,8 +6360,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
 	}
 
 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
-	   is important. Some (broken) drivers set IFF_PROMISC, when
-	   IFF_ALLMULTI is requested not asking us and not reporting.
+	 * is important. Some (broken) drivers set IFF_PROMISC, when
+	 * IFF_ALLMULTI is requested not asking us and not reporting.
 	 */
 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6724,8 +6725,8 @@ static void rollback_registered_many(struct list_head *head)
 
 
 		/* Notify protocols, that we are about to destroy
-		   this device. They should clean all the things.
-		*/
+		 * this device. They should clean all the things.
+		 */
 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 		if (!dev->rtnl_link_ops ||
@@ -7855,12 +7856,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	dev_shutdown(dev);
 
 	/* Notify protocols, that we are about to destroy
-	   this device. They should clean all the things.
-
-	   Note that dev->reg_state stays at NETREG_REGISTERED.
-	   This is wanted because this way 8021q and macvlan know
-	   the device is just moving and can keep their slaves up.
-	*/
+	 * this device. They should clean all the things.
+	 *
+	 * Note that dev->reg_state stays at NETREG_REGISTERED.
+	 * This is wanted because this way 8021q and macvlan know
+	 * the device is just moving and can keep their slaves up.
+	 */
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-- 
cgit v1.2.3


From f4563a75fb93d6a756416d97e13f0773ac373a24 Mon Sep 17 00:00:00 2001
From: tcharding
Date: Thu, 9 Feb 2017 17:56:07 +1100
Subject: net: Fix checkpatch, Missing a blank line after declarations

This patch fixes multiple occurrences of checkpatch WARNING: Missing
a blank line after declarations.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index e583820c21be..363c44b9be63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2498,6 +2498,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
 
 	if (dev->num_tc) {
 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
 		qoffset = dev->tc_to_txq[tc].offset;
 		qcount = dev->tc_to_txq[tc].count;
 	}
@@ -2719,9 +2720,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_HIGHMEM
 	int i;
+
 	if (!(dev->features & NETIF_F_HIGHDMA)) {
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
 			if (PageHighMem(skb_frag_page(frag)))
 				return 1;
 		}
@@ -2735,6 +2738,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+
 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
 				return 1;
 		}
@@ -3210,6 +3214,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
 		int new_index = get_xps_queue(dev, skb);
+
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, skb);
 
@@ -3239,6 +3244,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
+
 		if (ops->ndo_select_queue)
 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
 							    __netdev_pick_tx);
@@ -3768,6 +3774,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 #endif
 	{
 		unsigned int qtail;
+
 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 		put_cpu();
 	}
@@ -3827,6 +3834,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (clist) {
 			struct sk_buff *skb = clist;
+
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
@@ -5663,6 +5671,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
 			      struct list_head *dev_list)
 {
 	char linkname[IFNAMSIZ+7];
+
 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 		"upper_%s" : "lower_%s", adj_dev->name);
 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5673,6 +5682,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
 			       struct list_head *dev_list)
 {
 	char linkname[IFNAMSIZ+7];
+
 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 		"upper_%s" : "lower_%s", name);
 	sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5942,6 +5952,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
+
 	ASSERT_RTNL();
 
 	changeupper_info.upper_dev = upper_dev;
@@ -6659,6 +6670,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
 static int dev_new_index(struct net *net)
 {
 	int ifindex = net->ifindex;
+
 	for (;;) {
 		if (++ifindex <= 0)
 			ifindex = 1;
@@ -7072,6 +7084,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
 		netif_tx_stop_queue(txq);
 	}
 }
-- 
cgit v1.2.3


From f39b2dde4886450b965ccd0a6defe8bc33fcc938 Mon Sep 17 00:00:00 2001
From: Russell King
Date: Tue, 7 Feb 2017 15:02:54 -0800
Subject: net: sunrpc: fix build errors when linux/phy*.h is removed from
 net/dsa.h

Removing linux/phy.h from net/dsa.h reveals a build error in the sunrpc
code:

net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_rdma_bc_put':
net/sunrpc/xprtrdma/svc_rdma_backchannel.c:277:2: error: implicit declaration of function 'module_put' [-Werror=implicit-function-declaration]
net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_setup_rdma_bc':
net/sunrpc/xprtrdma/svc_rdma_backchannel.c:348:7: error: implicit declaration of function 'try_module_get' [-Werror=implicit-function-declaration]

Fix this by adding linux/module.h to svc_rdma_backchannel.c

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 288e35c2d8f4..cb1e48e54eb1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
  * Support for backward direction RPCs on RPC/RDMA (server-side).
  */
 
+#include <linux/module.h>
 #include <linux/sunrpc/svc_rdma.h>
 #include "xprt_rdma.h"
 
-- 
cgit v1.2.3


From 43cc277a93a2b74430e2ae03e92bd9bc4f303814 Mon Sep 17 00:00:00 2001
From: Russell King
Date: Tue, 7 Feb 2017 15:02:55 -0800
Subject: net: cgroups: fix build errors when linux/phy*.h is removed from
 net/dsa.h

net/core/netprio_cgroup.c:303:16: error: expected declaration specifiers or '...' before string constant
    MODULE_LICENSE("GPL v2");
                   ^~~~~~~~

Add linux/module.h to fix this.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..756637dc7a57 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,6 +13,7 @@
 
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/skbuff.h>
-- 
cgit v1.2.3


From adf200f31c000d707e4afe238ed1d1199e0cce7c Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 15:54:33 +0100
Subject: devlink: fix the name of eswitch commands

The eswitch_[gs]et command is supposed to be similar to port_[gs]et
command - for multiple eswitch attributes. However, when it was introduced
by 08f4b5918b2d ("net/devlink: Add E-Switch mode control") it was wrongly
named with the word "mode" in it. So fix this now, make the oririnal
enum value existing but obsolete.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h | 10 ++++++++--
 net/core/devlink.c           | 18 +++++++++---------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9014c33d4e77..0f1f3a12e23c 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -57,8 +57,14 @@ enum devlink_command {
 	DEVLINK_CMD_SB_OCC_SNAPSHOT,
 	DEVLINK_CMD_SB_OCC_MAX_CLEAR,
 
-	DEVLINK_CMD_ESWITCH_MODE_GET,
-	DEVLINK_CMD_ESWITCH_MODE_SET,
+	DEVLINK_CMD_ESWITCH_GET,
+#define DEVLINK_CMD_ESWITCH_MODE_GET /* obsolete, never use this! */ \
+	DEVLINK_CMD_ESWITCH_GET
+
+	DEVLINK_CMD_ESWITCH_SET,
+#define DEVLINK_CMD_ESWITCH_MODE_SET /* obsolete, never use this! */ \
+	DEVLINK_CMD_ESWITCH_SET
+
 	/* add new commands above here */
 
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2b5bf9efa720..7aa8e5369dc5 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1435,8 +1435,8 @@ out:
 	return err;
 }
 
-static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
-						struct genl_info *info)
+static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
+					   struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
 	const struct devlink_ops *ops = devlink->ops;
@@ -1450,7 +1450,7 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
 	if (!msg)
 		return -ENOMEM;
 
-	err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
+	err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
 				   info->snd_portid, info->snd_seq, 0);
 
 	if (err) {
@@ -1461,8 +1461,8 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
-static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
-						struct genl_info *info)
+static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
+					   struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
 	const struct devlink_ops *ops = devlink->ops;
@@ -1629,15 +1629,15 @@ static const struct genl_ops devlink_nl_ops[] = {
 				  DEVLINK_NL_FLAG_LOCK_PORTS,
 	},
 	{
-		.cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
-		.doit = devlink_nl_cmd_eswitch_mode_get_doit,
+		.cmd = DEVLINK_CMD_ESWITCH_GET,
+		.doit = devlink_nl_cmd_eswitch_get_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
 	{
-		.cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
-		.doit = devlink_nl_cmd_eswitch_mode_set_doit,
+		.cmd = DEVLINK_CMD_ESWITCH_SET,
+		.doit = devlink_nl_cmd_eswitch_set_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
-- 
cgit v1.2.3


From 21e3d2dd4a19f842e7d134c341eb584970ff3b32 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 15:54:34 +0100
Subject: devlink: rename devlink_eswitch_fill to devlink_nl_eswitch_fill

Be aligned with the rest of the file and name the helper function
accordingly.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7aa8e5369dc5..f36192406f9a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1392,9 +1392,9 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
 	return -EOPNOTSUPP;
 }
 
-static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
-				enum devlink_command cmd, u32 portid,
-				u32 seq, int flags)
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+				   enum devlink_command cmd, u32 portid,
+				   u32 seq, int flags)
 {
 	const struct devlink_ops *ops = devlink->ops;
 	void *hdr;
@@ -1450,8 +1450,8 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
 	if (!msg)
 		return -ENOMEM;
 
-	err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
-				   info->snd_portid, info->snd_seq, 0);
+	err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+				      info->snd_portid, info->snd_seq, 0);
 
 	if (err) {
 		nlmsg_free(msg);
-- 
cgit v1.2.3


From 1a6aa36b6f92b1a2f2e6789f6785372d4d6ddca9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 15:54:35 +0100
Subject: devlink: use nla_put_failure goto label instead of out

Be aligned with the rest of the code and use label named nla_put_failure.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index f36192406f9a..7f88cc879d43 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1408,29 +1408,29 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
 
 	err = devlink_nl_put_handle(msg, devlink);
 	if (err)
-		goto out;
+		goto nla_put_failure;
 
 	err = ops->eswitch_mode_get(devlink, &mode);
 	if (err)
-		goto out;
+		goto nla_put_failure;
 	err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
 	if (err)
-		goto out;
+		goto nla_put_failure;
 
 	if (ops->eswitch_inline_mode_get) {
 		err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
 		if (err)
-			goto out;
+			goto nla_put_failure;
 		err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
 				 inline_mode);
 		if (err)
-			goto out;
+			goto nla_put_failure;
 	}
 
 	genlmsg_end(msg, hdr);
 	return 0;
 
-out:
+nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return err;
 }
-- 
cgit v1.2.3


From 4456f61cfd2a589c4368fe0b9080b646b9bd470d Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 15:54:36 +0100
Subject: devlink: allow to fillup eswitch attrs even if mode_get op does not
 exist

Even when mode_get op is not present, other eswitch attrs need to be
filled-up.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7f88cc879d43..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1410,12 +1410,14 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (err)
 		goto nla_put_failure;
 
-	err = ops->eswitch_mode_get(devlink, &mode);
-	if (err)
-		goto nla_put_failure;
-	err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
-	if (err)
-		goto nla_put_failure;
+	if (ops->eswitch_mode_get) {
+		err = ops->eswitch_mode_get(devlink, &mode);
+		if (err)
+			goto nla_put_failure;
+		err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+		if (err)
+			goto nla_put_failure;
+	}
 
 	if (ops->eswitch_inline_mode_get) {
 		err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
@@ -1443,7 +1445,7 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
 	struct sk_buff *msg;
 	int err;
 
-	if (!ops || !ops->eswitch_mode_get)
+	if (!ops)
 		return -EOPNOTSUPP;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-- 
cgit v1.2.3


From c16ec18599c8c1722d476011786fd9e2529888f7 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Sat, 11 Feb 2017 13:49:20 +0200
Subject: net: rename dst_neigh_output back to neigh_output

After the dst->pending_confirm flag was removed, we do not
need anymore to provide dst arg to dst_neigh_output.
So, rename it to neigh_output as before commit 5110effee8fd
("net: Do delayed neigh confirmation.").

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c       |  4 ++--
 include/net/dst.h       | 12 ------------
 include/net/neighbour.h | 10 ++++++++++
 net/ipv4/ip_output.c    |  2 +-
 net/ipv6/ip6_output.c   |  2 +-
 5 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 630eafdb79e8..22379da63400 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -379,7 +379,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
 		sock_confirm_neigh(skb, neigh);
-		ret = dst_neigh_output(dst, neigh, skb);
+		ret = neigh_output(neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
 	}
@@ -577,7 +577,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
 		sock_confirm_neigh(skb, neigh);
-		ret = dst_neigh_output(dst, neigh, skb);
+		ret = neigh_output(neigh, skb);
 	}
 
 	rcu_read_unlock_bh();
diff --git a/include/net/dst.h b/include/net/dst.h
index 84a1043dd6a1..049af33da3b6 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -442,18 +442,6 @@ static inline void dst_confirm(struct dst_entry *dst)
 {
 }
 
-static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
-				   struct sk_buff *skb)
-{
-	const struct hh_cache *hh;
-
-	hh = &n->hh;
-	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
-		return neigh_hh_output(hh, skb);
-	else
-		return n->output(n, skb);
-}
-
 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 {
 	struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr);
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 8b683841e574..5ebf69491160 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -468,6 +468,16 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
 	return dev_queue_xmit(skb);
 }
 
+static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
+{
+	const struct hh_cache *hh = &n->hh;
+
+	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
+		return neigh_hh_output(hh, skb);
+	else
+		return n->output(n, skb);
+}
+
 static inline struct neighbour *
 __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
 {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7a719f1ae556..737ce826d7ec 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -225,7 +225,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 		int res;
 
 		sock_confirm_neigh(skb, neigh);
-		res = dst_neigh_output(dst, neigh, skb);
+		res = neigh_output(neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d299040613a0..a75871c62328 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -120,7 +120,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
 		sock_confirm_neigh(skb, neigh);
-		ret = dst_neigh_output(dst, neigh, skb);
+		ret = neigh_output(neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
 	}
-- 
cgit v1.2.3


From 87b60cfacf9f17cf71933c6e33b66e68160af71d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 10 Feb 2017 10:31:49 -0800
Subject: net_sched: fix error recovery at qdisc creation

Dmitry reported uses after free in qdisc code [1]

The problem here is that ops->init() can return an error.

qdisc_create_dflt() then call ops->destroy(),
while qdisc_create() does _not_ call it.

Four qdisc chose to call their own ops->destroy(), assuming their caller
would not.

This patch makes sure qdisc_create() calls ops->destroy()
and fixes the four qdisc to avoid double free.

[1]
BUG: KASAN: use-after-free in mq_destroy+0x242/0x290 net/sched/sch_mq.c:33 at addr ffff8801d415d440
Read of size 8 by task syz-executor2/5030
CPU: 0 PID: 5030 Comm: syz-executor2 Not tainted 4.3.5-smp-DEV #119
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 0000000000000046 ffff8801b435b870 ffffffff81bbbed4 ffff8801db000400
 ffff8801d415d440 ffff8801d415dc40 ffff8801c4988510 ffff8801b435b898
 ffffffff816682b1 ffff8801b435b928 ffff8801d415d440 ffff8801c49880c0
Call Trace:
 [<ffffffff81bbbed4>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff81bbbed4>] dump_stack+0x6c/0x98 lib/dump_stack.c:51
 [<ffffffff816682b1>] kasan_object_err+0x21/0x70 mm/kasan/report.c:158
 [<ffffffff81668524>] print_address_description mm/kasan/report.c:196 [inline]
 [<ffffffff81668524>] kasan_report_error+0x1b4/0x4b0 mm/kasan/report.c:285
 [<ffffffff81668953>] kasan_report mm/kasan/report.c:305 [inline]
 [<ffffffff81668953>] __asan_report_load8_noabort+0x43/0x50 mm/kasan/report.c:326
 [<ffffffff82527b02>] mq_destroy+0x242/0x290 net/sched/sch_mq.c:33
 [<ffffffff82524bdd>] qdisc_destroy+0x12d/0x290 net/sched/sch_generic.c:953
 [<ffffffff82524e30>] qdisc_create_dflt+0xf0/0x120 net/sched/sch_generic.c:848
 [<ffffffff8252550d>] attach_default_qdiscs net/sched/sch_generic.c:1029 [inline]
 [<ffffffff8252550d>] dev_activate+0x6ad/0x880 net/sched/sch_generic.c:1064
 [<ffffffff824b1db1>] __dev_open+0x221/0x320 net/core/dev.c:1403
 [<ffffffff824b24ce>] __dev_change_flags+0x15e/0x3e0 net/core/dev.c:6858
 [<ffffffff824b27de>] dev_change_flags+0x8e/0x140 net/core/dev.c:6926
 [<ffffffff824f5bf6>] dev_ifsioc+0x446/0x890 net/core/dev_ioctl.c:260
 [<ffffffff824f61fa>] dev_ioctl+0x1ba/0xb80 net/core/dev_ioctl.c:546
 [<ffffffff82430509>] sock_do_ioctl+0x99/0xb0 net/socket.c:879
 [<ffffffff82430d30>] sock_ioctl+0x2a0/0x390 net/socket.c:958
 [<ffffffff816f3b68>] vfs_ioctl fs/ioctl.c:44 [inline]
 [<ffffffff816f3b68>] do_vfs_ioctl+0x8a8/0xe50 fs/ioctl.c:611
 [<ffffffff816f41a4>] SYSC_ioctl fs/ioctl.c:626 [inline]
 [<ffffffff816f41a4>] SyS_ioctl+0x94/0xc0 fs/ioctl.c:617
 [<ffffffff8123e357>] entry_SYSCALL_64_fastpath+0x12/0x17

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c    |  2 ++
 net/sched/sch_hhf.c    |  8 ++++++--
 net/sched/sch_mq.c     | 10 +++-------
 net/sched/sch_mqprio.c | 19 ++++++-------------
 net/sched/sch_sfq.c    |  3 ++-
 5 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index adeabaec0d0b..a13c15e8f087 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1019,6 +1019,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 
 		return sch;
 	}
+	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+	ops->destroy(sch);
 err_out3:
 	dev_put(dev);
 	kfree((char *) sch - sch->padded);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index e3d0458af17b..2fae8b5f1b80 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
 			q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
 						      sizeof(u32));
 			if (!q->hhf_arrays[i]) {
-				hhf_destroy(sch);
+				/* Note: hhf_destroy() will be called
+				 * by our caller.
+				 */
 				return -ENOMEM;
 			}
 		}
@@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
 			q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
 							  BITS_PER_BYTE);
 			if (!q->hhf_valid_bits[i]) {
-				hhf_destroy(sch);
+				/* Note: hhf_destroy() will be called
+				 * by our caller.
+				 */
 				return -ENOMEM;
 			}
 		}
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 2bc8d7f8df16..20b7f1646f69 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 	/* pre-allocate qdiscs, attachment can't fail */
 	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
 			       GFP_KERNEL);
-	if (priv->qdiscs == NULL)
+	if (!priv->qdiscs)
 		return -ENOMEM;
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
@@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 		qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
-		if (qdisc == NULL)
-			goto err;
+		if (!qdisc)
+			return -ENOMEM;
 		priv->qdiscs[ntx] = qdisc;
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
 	return 0;
-
-err:
-	mq_destroy(sch);
-	return -ENOMEM;
 }
 
 static void mq_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b5c502c78143..922683418e53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	/* pre-allocate qdisc, attachment can't fail */
 	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
 			       GFP_KERNEL);
-	if (priv->qdiscs == NULL) {
-		err = -ENOMEM;
-		goto err;
-	}
+	if (!priv->qdiscs)
+		return -ENOMEM;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
@@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 					  get_default_qdisc_ops(dev, i),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
-		if (qdisc == NULL) {
-			err = -ENOMEM;
-			goto err;
-		}
+		if (!qdisc)
+			return -ENOMEM;
+
 		priv->qdiscs[i] = qdisc;
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	}
@@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 		priv->hw_owned = 1;
 		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 		if (err)
-			goto err;
+			return err;
 	} else {
 		netdev_set_num_tc(dev, qopt->num_tc);
 		for (i = 0; i < qopt->num_tc; i++)
@@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->flags |= TCQ_F_MQROOT;
 	return 0;
-
-err:
-	mqprio_destroy(sch);
-	return err;
 }
 
 static void mqprio_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 83d06e251b93..42e8c8615e65 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -743,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 	if (!q->ht || !q->slots) {
-		sfq_destroy(sch);
+		/* Note: sfq_destroy() will be called by our caller */
 		return -ENOMEM;
 	}
+
 	for (i = 0; i < q->divisor; i++)
 		q->ht[i] = SFQ_EMPTY_SLOT;
 
-- 
cgit v1.2.3


From a96e66e702bc615f5927b15c739c91d5098dac59 Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Thu, 9 Feb 2017 16:46:45 +0800
Subject: netfilter: nf_ct_sip: Use mod_timer_pending()

timer_del() followed by timer_add() can be replaced by
mod_timer_pending().

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c3fc14e021ec..24174c520239 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
 		    exp->tuple.dst.protonum != proto ||
 		    exp->tuple.dst.u.udp.port != port)
 			continue;
-		if (!del_timer(&exp->timeout))
-			continue;
-		exp->flags &= ~NF_CT_EXPECT_INACTIVE;
-		exp->timeout.expires = jiffies + expires * HZ;
-		add_timer(&exp->timeout);
-		found = 1;
-		break;
+		if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
+			exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+			found = 1;
+			break;
+		}
 	}
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 	return found;
-- 
cgit v1.2.3


From 4dee62b1b9b43c9bbf49c93cd114e1bf1334fb6a Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Thu, 9 Feb 2017 21:40:38 +0800
Subject: netfilter: nf_ct_expect: nf_ct_expect_insert() returns void

Because nf_ct_expect_insert() always succeeds now, its return value can
be just void instead of int. And remove code that checks for its return
value.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_expect.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f8dbacf66795..e19a69787d99 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_put);
 
-static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct nf_conntrack_helper *helper;
@@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	add_timer(&exp->timeout);
 
 	NF_CT_STAT_INC(net, expect_create);
-	return 0;
 }
 
 /* Race with expectations being used means we could have none to find; OK. */
@@ -464,9 +463,8 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 	if (ret <= 0)
 		goto out;
 
-	ret = nf_ct_expect_insert(expect);
-	if (ret < 0)
-		goto out;
+	nf_ct_expect_insert(expect);
+
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
 	return ret;
-- 
cgit v1.2.3


From b745d0358db44ebcef31478314436d6d4ff93bfa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:08 +0100
Subject: netfilter: nfnetlink: get rid of u_intX_t types

Use uX types instead.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a09fa9fd8f3d..586212ebba9e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
 
-static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
 {
-	u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+	u8 subsys_id = NFNL_SUBSYS_ID(type);
 
 	if (subsys_id >= NFNL_SUBSYS_COUNT)
 		return NULL;
@@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
 }
 
 static inline const struct nfnl_callback *
-nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
+nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
 {
-	u_int8_t cb_id = NFNL_MSG_TYPE(type);
+	u8 cb_id = NFNL_MSG_TYPE(type);
 
 	if (cb_id >= ss->cb_count)
 		return NULL;
@@ -185,7 +185,7 @@ replay:
 
 	{
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-		u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+		u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
@@ -273,7 +273,7 @@ enum {
 };
 
 static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
-				u_int16_t subsys_id)
+				u16 subsys_id)
 {
 	struct sk_buff *oskb = skb;
 	struct net *net = sock_net(skb->sk);
@@ -365,7 +365,7 @@ replay:
 
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-			u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
@@ -439,7 +439,7 @@ done:
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	u_int16_t res_id;
+	u16 res_id;
 	int msglen;
 
 	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
-- 
cgit v1.2.3


From 48656835c0405aa2e6c0d6a4305c77b70758d168 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:14 +0100
Subject: netfilter: nfnetlink: add nfnetlink_rcv_skb_batch()

Add new nfnetlink_rcv_skb_batch() to wrap initial nfnetlink batch
handling.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 51 ++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 586212ebba9e..ca645a3b1375 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -436,12 +436,35 @@ done:
 	kfree_skb(skb);
 }
 
-static void nfnetlink_rcv(struct sk_buff *skb)
+static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	struct nfgenmsg *nfgenmsg;
 	u16 res_id;
 	int msglen;
 
+	msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+	if (msglen > skb->len)
+		msglen = skb->len;
+
+	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+	    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+		return;
+
+	nfgenmsg = nlmsg_data(nlh);
+	skb_pull(skb, msglen);
+	/* Work around old nft using host byte order */
+	if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+		res_id = NFNL_SUBSYS_NFTABLES;
+	else
+		res_id = ntohs(nfgenmsg->res_id);
+
+	nfnetlink_rcv_batch(skb, nlh, res_id);
+}
+
+static void nfnetlink_rcv(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+
 	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
 	    skb->len < nlh->nlmsg_len)
 		return;
@@ -451,28 +474,10 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 		return;
 	}
 
-	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
-		struct nfgenmsg *nfgenmsg;
-
-		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (msglen > skb->len)
-			msglen = skb->len;
-
-		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
-		    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
-			return;
-
-		nfgenmsg = nlmsg_data(nlh);
-		skb_pull(skb, msglen);
-		/* Work around old nft using host byte order */
-		if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
-			res_id = NFNL_SUBSYS_NFTABLES;
-		else
-			res_id = ntohs(nfgenmsg->res_id);
-		nfnetlink_rcv_batch(skb, nlh, res_id);
-	} else {
+	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
+		nfnetlink_rcv_skb_batch(skb, nlh);
+	else
 		netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
-	}
 }
 
 #ifdef CONFIG_MODULES
-- 
cgit v1.2.3


From 8c4d4e8b5626fec965fd5034e5bd5e57790f243f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:17 +0100
Subject: netfilter: nfnetlink: allow to check for generation ID

This patch allows userspace to specify the generation ID that has been
used to build an incremental batch update.

If userspace specifies the generation ID in the batch message as
attribute, then nfnetlink compares it to the current generation ID so
you make sure that you work against the right baseline. Otherwise, bail
out with ERESTART so userspace knows that its changeset is stale and
needs to respin. Userspace can do this transparently at the cost of
taking slightly more time to refresh caches and rework the changeset.

This check is optional, if there is no NFNL_BATCH_GENID attribute in the
batch begin message, then no check is performed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h      |  1 +
 include/uapi/linux/netfilter/nfnetlink.h | 12 ++++++++++++
 net/netfilter/nfnetlink.c                | 31 +++++++++++++++++++++++++++----
 3 files changed, 40 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 1d82dd5e9a08..1b49209dd5c7 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -28,6 +28,7 @@ struct nfnetlink_subsystem {
 	const struct nfnl_callback *cb;	/* callback for individual types */
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
+	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 4bb8cb7730e7..a09906a30d77 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -65,4 +65,16 @@ struct nfgenmsg {
 #define NFNL_MSG_BATCH_BEGIN		NLMSG_MIN_TYPE
 #define NFNL_MSG_BATCH_END		NLMSG_MIN_TYPE+1
 
+/**
+ * enum nfnl_batch_attributes - nfnetlink batch netlink attributes
+ *
+ * @NFNL_BATCH_GENID: generation ID for this changeset (NLA_U32)
+ */
+enum nfnl_batch_attributes {
+        NFNL_BATCH_UNSPEC,
+        NFNL_BATCH_GENID,
+        __NFNL_BATCH_MAX
+};
+#define NFNL_BATCH_MAX			(__NFNL_BATCH_MAX - 1)
+
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index ca645a3b1375..a2148d0bc50e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
  *
  * (C) 2001 by Jay Schulist <jschlst@samba.org>,
  * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * Initial netfilter messages via netlink development funded and
  * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -273,7 +273,7 @@ enum {
 };
 
 static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
-				u16 subsys_id)
+				u16 subsys_id, u32 genid)
 {
 	struct sk_buff *oskb = skb;
 	struct net *net = sock_net(skb->sk);
@@ -315,6 +315,12 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -ERESTART);
+		return kfree_skb(skb);
+	}
+
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;
 
@@ -436,11 +442,20 @@ done:
 	kfree_skb(skb);
 }
 
+static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
+	[NFNL_BATCH_GENID]	= { .type = NLA_U32 },
+};
+
 static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
+	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+	struct nlattr *attr = (void *)nlh + min_len;
+	struct nlattr *cda[NFNL_BATCH_MAX + 1];
+	int attrlen = nlh->nlmsg_len - min_len;
 	struct nfgenmsg *nfgenmsg;
+	int msglen, err;
+	u32 gen_id = 0;
 	u16 res_id;
-	int msglen;
 
 	msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 	if (msglen > skb->len)
@@ -450,6 +465,14 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
 		return;
 
+	err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
+	if (err < 0) {
+		netlink_ack(skb, nlh, err);
+		return;
+	}
+	if (cda[NFNL_BATCH_GENID])
+		gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
+
 	nfgenmsg = nlmsg_data(nlh);
 	skb_pull(skb, msglen);
 	/* Work around old nft using host byte order */
@@ -458,7 +481,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	else
 		res_id = ntohs(nfgenmsg->res_id);
 
-	nfnetlink_rcv_batch(skb, nlh, res_id);
+	nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
 }
 
 static void nfnetlink_rcv(struct sk_buff *skb)
-- 
cgit v1.2.3


From 74e8bcd21c40dbbb3d74fa904536f8a3bddafed3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:20 +0100
Subject: netfilter: nf_tables: add check_genid to the nfnetlink subsystem

This patch implements the check generation id as provided by nfnetlink.
This allows us to reject ruleset updates against stale baseline, so
userspace can retry update with a fresh ruleset cache.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cb6ae46f6c48..71c60a04b66b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4972,6 +4972,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static bool nf_tables_valid_genid(struct net *net, u32 genid)
+{
+	return net->nft.base_seq == genid;
+}
+
 static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.name		= "nf_tables",
 	.subsys_id	= NFNL_SUBSYS_NFTABLES,
@@ -4979,6 +4984,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
+	.valid_genid	= nf_tables_valid_genid,
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
-- 
cgit v1.2.3


From 1a94e38d254b3622d5d53f74b3b716b0fcab0ba8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:23 +0100
Subject: netfilter: nf_tables: add NFTA_RULE_ID attribute

This new attribute allows us to uniquely identify a rule in transaction.
Robots may trigger an insertion followed by deletion in a batch, in that
scenario we still don't have a public rule handle that we can use to
delete the rule. This is similar to the NFTA_SET_ID attribute that
allows us to refer to an anonymous set from a batch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  3 +++
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c            | 26 ++++++++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 21ce50e6d0c5..ac84686aaafb 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1202,10 +1202,13 @@ struct nft_trans {
 
 struct nft_trans_rule {
 	struct nft_rule			*rule;
+	u32				rule_id;
 };
 
 #define nft_trans_rule(trans)	\
 	(((struct nft_trans_rule *)trans->data)->rule)
+#define nft_trans_rule_id(trans)	\
+	(((struct nft_trans_rule *)trans->data)->rule_id)
 
 struct nft_trans_set {
 	struct nft_set			*set;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 207951516ede..05215d30fe5c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -207,6 +207,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
  * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64)
  * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN)
+ * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -218,6 +219,7 @@ enum nft_rule_attributes {
 	NFTA_RULE_POSITION,
 	NFTA_RULE_USERDATA,
 	NFTA_RULE_PAD,
+	NFTA_RULE_ID,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 71c60a04b66b..6c782532615f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
 	if (trans == NULL)
 		return NULL;
 
+	if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) {
+		nft_trans_rule_id(trans) =
+			ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
+	}
 	nft_trans_rule(trans) = rule;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 
@@ -2293,6 +2297,22 @@ err1:
 	return err;
 }
 
+static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+					     const struct nlattr *nla)
+{
+	u32 id = ntohl(nla_get_be32(nla));
+	struct nft_trans *trans;
+
+	list_for_each_entry(trans, &net->nft.commit_list, list) {
+		struct nft_rule *rule = nft_trans_rule(trans);
+
+		if (trans->msg_type == NFT_MSG_NEWRULE &&
+		    id == nft_trans_rule_id(trans))
+			return rule;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			     struct sk_buff *skb, const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[])
@@ -2330,6 +2350,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			if (IS_ERR(rule))
 				return PTR_ERR(rule);
 
+			err = nft_delrule(&ctx, rule);
+		} else if (nla[NFTA_RULE_ID]) {
+			rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+			if (IS_ERR(rule))
+				return PTR_ERR(rule);
+
 			err = nft_delrule(&ctx, rule);
 		} else {
 			err = nft_delrule_by_chain(&ctx);
-- 
cgit v1.2.3


From 7286ff7fde9f963736c7e575572899d8e16b06b7 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 19:59:36 +0100
Subject: netfilter: nf_tables: honor NFT_SET_OBJECT in set backend selection

Check for NFT_SET_OBJECT feature flag, otherwise we may end up selecting
the wrong set backend.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c  | 3 ++-
 net/netfilter/nft_set_hash.c   | 2 +-
 net/netfilter/nft_set_rbtree.c | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6c782532615f..ff7304ae58ac 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2424,7 +2424,8 @@ nft_select_set_ops(const struct nlattr * const nla[],
 	features = 0;
 	if (nla[NFTA_SET_FLAGS] != NULL) {
 		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
-		features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT;
+		features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT |
+			    NFT_SET_OBJECT;
 	}
 
 	bops	    = NULL;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 6938bc890f31..5f652720fc78 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -404,7 +404,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
 	.lookup		= nft_hash_lookup,
 	.update		= nft_hash_update,
 	.walk		= nft_hash_walk,
-	.features	= NFT_SET_MAP | NFT_SET_TIMEOUT,
+	.features	= NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 3387ed7dd231..71e8fb886a73 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -310,7 +310,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
 	.activate	= nft_rbtree_activate,
 	.lookup		= nft_rbtree_lookup,
 	.walk		= nft_rbtree_walk,
-	.features	= NFT_SET_INTERVAL | NFT_SET_MAP,
+	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
 	.owner		= THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 37fabbf4d489cc2e1cbf7cde816d9453a65ddfb7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 10 Feb 2017 05:46:46 -0800
Subject: net: busy-poll: remove LL_FLUSH_FAILED and LL_FLUSH_BUSY

Commit 79e7fff47b7b ("net: remove support for per driver
ndo_busy_poll()") made them obsolete.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 4 ----
 net/core/dev.c          | 3 ---
 2 files changed, 7 deletions(-)

(limited to 'net')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index d73b849e29a6..b8d637225a07 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -33,10 +33,6 @@ struct napi_struct;
 extern unsigned int sysctl_net_busy_read __read_mostly;
 extern unsigned int sysctl_net_busy_poll __read_mostly;
 
-/* return values from ndo_ll_poll */
-#define LL_FLUSH_FAILED		-1
-#define LL_FLUSH_BUSY		-2
-
 static inline bool net_busy_loop_on(void)
 {
 	return sysctl_net_busy_poll;
diff --git a/net/core/dev.c b/net/core/dev.c
index 363c44b9be63..2f1bbe1bf67c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5008,9 +5008,6 @@ count:
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
 		local_bh_enable();
 
-		if (rc == LL_FLUSH_FAILED)
-			break; /* permanent failure */
-
 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
 		    busy_loop_timeout(end_time))
 			break;
-- 
cgit v1.2.3


From d7cf52c249d7aa797ea27808e1526604309ec142 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Tue, 14 Feb 2017 16:27:13 +0100
Subject: sched: Fix accidental removal of errout goto

Bring back the goto that was removed by accident.

Reported-by: Colin Ian King <colin.king@canonical.com>
Fixes: 40c81b25b16c ("sched: check negative err value to safe one level of indent")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index abe1fe13ae54..732f7cae459d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -389,6 +389,7 @@ replay:
 				       RTM_DELTFILTER, false);
 			if (tcf_proto_destroy(tp, false))
 				RCU_INIT_POINTER(*back, next);
+			goto errout;
 		case RTM_GETTFILTER:
 			err = tfilter_notify(net, skb, n, tp, fh,
 					     RTM_NEWTFILTER, true);
-- 
cgit v1.2.3


From bfd0aeac52f74bfb44c0974131e44abb33a13e78 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Mon, 13 Feb 2017 14:59:09 +0100
Subject: bridge: fdb: converge fdb searching functions into one

Before this patch we had 3 different fdb searching functions which was
confusing. This patch reduces all of them to one - fdb_find_rcu(), and
two flavors: br_fdb_find() which requires hash_lock and br_fdb_find_rcu
which requires RCU. This makes it clear what needs to be used, we also
remove two abusers of __br_fdb_get which called it under hash_lock and
replace them with br_fdb_find().

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  |   2 +-
 net/bridge/br_fdb.c     | 113 ++++++++++++++++++++----------------------------
 net/bridge/br_input.c   |   4 +-
 net/bridge/br_private.h |   5 ++-
 4 files changed, 54 insertions(+), 70 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index d208ee9ab60a..ea71513fca21 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 			br_multicast_flood(mdst, skb, false, true);
 		else
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
-	} else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
+	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
 		br_forward(dst->dst, skb, false, true);
 	} else {
 		br_flood(br, skb, BR_PKT_UNICAST, false, true);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5693168e88b6..04a53a0b733d 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,9 +28,6 @@
 #include "br_private.h"
 
 static struct kmem_cache *br_fdb_cache __read_mostly;
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
-					     const unsigned char *addr,
-					     __u16 vid);
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
@@ -86,6 +83,43 @@ static void fdb_rcu_free(struct rcu_head *head)
 	kmem_cache_free(br_fdb_cache, ent);
 }
 
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+						 const unsigned char *addr,
+						 __u16 vid)
+{
+	struct net_bridge_fdb_entry *f;
+
+	hlist_for_each_entry_rcu(f, head, hlist)
+		if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
+			break;
+
+	return f;
+}
+
+/* requires bridge hash_lock */
+static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
+						const unsigned char *addr,
+						__u16 vid)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+	struct net_bridge_fdb_entry *fdb;
+
+	rcu_read_lock();
+	fdb = fdb_find_rcu(head, addr, vid);
+	rcu_read_unlock();
+
+	return fdb;
+}
+
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+					     const unsigned char *addr,
+					     __u16 vid)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+
+	return fdb_find_rcu(head, addr, vid);
+}
+
 /* When a static FDB entry is added, the mac address from the entry is
  * added to the bridge private HW address list and all required ports
  * are then updated with the new information.
@@ -198,11 +232,10 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *f;
 
 	spin_lock_bh(&br->hash_lock);
-	f = fdb_find(head, addr, vid);
+	f = br_fdb_find(br, addr, vid);
 	if (f && f->is_local && !f->added_by_user && f->dst == p)
 		fdb_delete_local(br, p, f);
 	spin_unlock_bh(&br->hash_lock);
@@ -266,7 +299,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	spin_lock_bh(&br->hash_lock);
 
 	/* If old entry was unassociated with any port, then delete it. */
-	f = __br_fdb_get(br, br->dev->dev_addr, 0);
+	f = br_fdb_find(br, br->dev->dev_addr, 0);
 	if (f && f->is_local && !f->dst && !f->added_by_user)
 		fdb_delete_local(br, NULL, f);
 
@@ -281,7 +314,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	list_for_each_entry(v, &vg->vlan_list, vlist) {
 		if (!br_vlan_should_use(v))
 			continue;
-		f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
+		f = br_fdb_find(br, br->dev->dev_addr, v->vid);
 		if (f && f->is_local && !f->dst && !f->added_by_user)
 			fdb_delete_local(br, NULL, f);
 		fdb_insert(br, NULL, newaddr, v->vid);
@@ -380,24 +413,6 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 	spin_unlock_bh(&br->hash_lock);
 }
 
-/* No locking or refcounting, assumes caller has rcu_read_lock */
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
-					  const unsigned char *addr,
-					  __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry_rcu(fdb,
-				&br->hash[br_mac_hash(addr, vid)], hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid) {
-			return fdb;
-		}
-	}
-
-	return NULL;
-}
-
 #if IS_ENABLED(CONFIG_ATM_LANE)
 /* Interface used by ATM LANE hook to test
  * if an addr is on some other bridge port */
@@ -412,7 +427,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 	if (!port)
 		ret = 0;
 	else {
-		fdb = __br_fdb_get(port->br, addr, 0);
+		fdb = br_fdb_find_rcu(port->br, addr, 0);
 		ret = fdb && fdb->dst && fdb->dst->dev != dev &&
 			fdb->dst->state == BR_STATE_FORWARDING;
 	}
@@ -474,34 +489,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 	return num;
 }
 
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
-					     const unsigned char *addr,
-					     __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry(fdb, head, hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid)
-			return fdb;
-	}
-	return NULL;
-}
-
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
-						 const unsigned char *addr,
-						 __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry_rcu(fdb, head, hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid)
-			return fdb;
-	}
-	return NULL;
-}
-
 static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 					       struct net_bridge_port *source,
 					       const unsigned char *addr,
@@ -535,7 +522,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	if (!is_valid_ether_addr(addr))
 		return -EINVAL;
 
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb) {
 		/* it is okay to have multiple ports with same
 		 * address, just use the first one.
@@ -608,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		}
 	} else {
 		spin_lock(&br->hash_lock);
-		if (likely(!fdb_find(head, addr, vid))) {
+		if (likely(!fdb_find_rcu(head, addr, vid))) {
 			fdb = fdb_create(head, source, addr, vid, 0, 0);
 			if (fdb) {
 				if (unlikely(added_by_user))
@@ -792,7 +779,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		return -EINVAL;
 	}
 
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb == NULL) {
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
@@ -942,10 +929,9 @@ out:
 static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr,
 			      u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb)
 		return -ENOENT;
 
@@ -969,10 +955,9 @@ static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
 				       const u8 *addr, u16 vlan)
 {
 	struct net_bridge *br = p->br;
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
 	struct net_bridge_fdb_entry *fdb;
 
-	fdb = fdb_find(head, addr, vlan);
+	fdb = br_fdb_find(br, addr, vlan);
 	if (!fdb || fdb->dst != p)
 		return -ENOENT;
 
@@ -1117,7 +1102,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 	spin_lock_bh(&br->hash_lock);
 
 	head = &br->hash[br_mac_hash(addr, vid)];
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb) {
 		fdb = fdb_create(head, p, addr, vid, 0, 0);
 		if (!fdb) {
@@ -1145,15 +1130,13 @@ err_unlock:
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head;
 	struct net_bridge_fdb_entry *fdb;
 	int err = 0;
 
 	ASSERT_RTNL();
 	spin_lock_bh(&br->hash_lock);
 
-	head = &br->hash[br_mac_hash(addr, vid)];
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb && fdb->added_by_external_learn)
 		fdb_delete(br, fdb);
 	else
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 4615a9b3e26c..236f34244dbe 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,7 +114,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
 			return;
 		}
 
-		f = __br_fdb_get(br, n->ha, vid);
+		f = br_fdb_find_rcu(br, n->ha, vid);
 		if (f && ((p->flags & BR_PROXYARP) ||
 			  (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
 			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
@@ -189,7 +189,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		}
 		break;
 	case BR_PKT_UNICAST:
-		dst = __br_fdb_get(br, dest, vid);
+		dst = br_fdb_find_rcu(br, dest, vid);
 	default:
 		break;
 	}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1cbbf63a5ef7..61368186edea 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -507,8 +507,9 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
 void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
-					  const unsigned char *addr, __u16 vid);
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+					     const unsigned char *addr,
+					     __u16 vid);
 int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
 int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
 		   unsigned long off);
-- 
cgit v1.2.3


From 410b3d48f5111a28bb8d4c3d3dc5984c1baf7fc9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Mon, 13 Feb 2017 14:59:10 +0100
Subject: bridge: fdb: add proper lock checks in searching functions

In order to avoid new errors add checks to br_fdb_find and fdb_find_rcu
functions. The first requires hash_lock, the second obviously RCU.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c     | 4 ++++
 net/bridge/br_private.h | 9 +++++++++
 2 files changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 04a53a0b733d..da5ed7d0a3c8 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -89,6 +89,8 @@ static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
 {
 	struct net_bridge_fdb_entry *f;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	hlist_for_each_entry_rcu(f, head, hlist)
 		if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
 			break;
@@ -104,6 +106,8 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
 	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
+	WARN_ON_ONCE(!br_hash_lock_held(br));
+
 	rcu_read_lock();
 	fdb = fdb_find_rcu(head, addr, vid);
 	rcu_read_unlock();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 61368186edea..2288fca7756c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -531,6 +531,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid);
 
+static inline bool br_hash_lock_held(struct net_bridge *br)
+{
+#ifdef CONFIG_LOCKDEP
+	return lockdep_is_held(&br->hash_lock);
+#else
+	return true;
+#endif
+}
+
 /* br_forward.c */
 enum br_pkt_type {
 	BR_PKT_UNICAST,
-- 
cgit v1.2.3


From 5019ab50f26c247abb510a80635b25b64b6c1f4b Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Mon, 13 Feb 2017 14:59:11 +0100
Subject: bridge: fdb: converge fdb_delete_by functions into one

We can simplify the logic of entries pointing to the bridge by
converging the fdb_delete_by functions, this would allow us to use the
same function for both cases since the fdb's dst is set to NULL if it is
pointing to the bridge thus we can always check for a port match.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c | 62 +++++++++++++----------------------------------------
 1 file changed, 15 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index da5ed7d0a3c8..4ac11574117c 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -930,35 +930,10 @@ out:
 	return err;
 }
 
-static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr,
-			      u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	fdb = br_fdb_find(br, addr, vid);
-	if (!fdb)
-		return -ENOENT;
-
-	fdb_delete(br, fdb);
-	return 0;
-}
-
-static int __br_fdb_delete_by_addr(struct net_bridge *br,
-				   const unsigned char *addr, u16 vid)
-{
-	int err;
-
-	spin_lock_bh(&br->hash_lock);
-	err = fdb_delete_by_addr(br, addr, vid);
-	spin_unlock_bh(&br->hash_lock);
-
-	return err;
-}
-
-static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
+static int fdb_delete_by_addr_and_port(struct net_bridge *br,
+				       const struct net_bridge_port *p,
 				       const u8 *addr, u16 vlan)
 {
-	struct net_bridge *br = p->br;
 	struct net_bridge_fdb_entry *fdb;
 
 	fdb = br_fdb_find(br, addr, vlan);
@@ -966,17 +941,19 @@ static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
 		return -ENOENT;
 
 	fdb_delete(br, fdb);
+
 	return 0;
 }
 
-static int __br_fdb_delete(struct net_bridge_port *p,
+static int __br_fdb_delete(struct net_bridge *br,
+			   const struct net_bridge_port *p,
 			   const unsigned char *addr, u16 vid)
 {
 	int err;
 
-	spin_lock_bh(&p->br->hash_lock);
-	err = fdb_delete_by_addr_and_port(p, addr, vid);
-	spin_unlock_bh(&p->br->hash_lock);
+	spin_lock_bh(&br->hash_lock);
+	err = fdb_delete_by_addr_and_port(br, p, addr, vid);
+	spin_unlock_bh(&br->hash_lock);
 
 	return err;
 }
@@ -989,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_port *p = NULL;
 	struct net_bridge_vlan *v;
-	struct net_bridge *br = NULL;
+	struct net_bridge *br;
 	int err;
 
 	if (dev->priv_flags & IFF_EBRIDGE) {
@@ -1003,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			return -EINVAL;
 		}
 		vg = nbp_vlan_group(p);
+		br = p->br;
 	}
 
 	if (vid) {
@@ -1012,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			return -EINVAL;
 		}
 
-		if (dev->priv_flags & IFF_EBRIDGE)
-			err = __br_fdb_delete_by_addr(br, addr, vid);
-		else
-			err = __br_fdb_delete(p, addr, vid);
+		err = __br_fdb_delete(br, p, addr, vid);
 	} else {
 		err = -ENOENT;
-		if (dev->priv_flags & IFF_EBRIDGE)
-			err = __br_fdb_delete_by_addr(br, addr, 0);
-		else
-			err &= __br_fdb_delete(p, addr, 0);
-
+		err &= __br_fdb_delete(br, p, addr, 0);
 		if (!vg || !vg->num_vlans)
-			goto out;
+			return err;
 
 		list_for_each_entry(v, &vg->vlan_list, vlist) {
 			if (!br_vlan_should_use(v))
 				continue;
-			if (dev->priv_flags & IFF_EBRIDGE)
-				err = __br_fdb_delete_by_addr(br, addr, v->vid);
-			else
-				err &= __br_fdb_delete(p, addr, v->vid);
+			err &= __br_fdb_delete(br, p, addr, v->vid);
 		}
 	}
-out:
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 6f2e3f7d9785dacb358b48b44950182b5c13e4bc Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Tue, 14 Feb 2017 14:19:32 +0000
Subject: net_sched: nla_memdup_cookie() can be static

Fixes the following sparse warning:

net/sched/act_api.c:532:5: warning:
 symbol 'nla_memdup_cookie' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3c5e29ba6594..f219ff325ed4 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -529,7 +529,7 @@ errout:
 	return err;
 }
 
-int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb)
+static int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb)
 {
 	a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL);
 	if (!a->act_cookie)
-- 
cgit v1.2.3


From b0fcee825c0ad05057a97d1f4685e1b9e9d00c53 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:39:24 +0100
Subject: xfrm: Add a secpath_set helper.

Add a new helper to set the secpath to the skb.
This avoids code duplication, as this is used
in multiple places.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  1 +
 net/ipv6/xfrm6_input.c | 15 +++------------
 net/xfrm/xfrm_input.c  | 34 ++++++++++++++++++++++------------
 3 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 6e061309adca..287635df4eef 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1006,6 +1006,7 @@ secpath_put(struct sec_path *sp)
 }
 
 struct sec_path *secpath_dup(struct sec_path *src);
+int secpath_set(struct sk_buff *skb);
 
 static inline void
 secpath_reset(struct sk_buff *skb)
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index b5789562aded..662fb2c3e765 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -69,18 +69,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 	struct xfrm_state *x = NULL;
 	int i = 0;
 
-	/* Allocate new secpath or COW existing one. */
-	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
-		struct sec_path *sp;
-
-		sp = secpath_dup(skb->sp);
-		if (!sp) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
-			goto drop;
-		}
-		if (skb->sp)
-			secpath_put(skb->sp);
-		skb->sp = sp;
+	if (secpath_set(skb)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+		goto drop;
 	}
 
 	if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 8722294c6e59..d8f913bb6919 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -117,6 +117,24 @@ struct sec_path *secpath_dup(struct sec_path *src)
 }
 EXPORT_SYMBOL(secpath_dup);
 
+int secpath_set(struct sk_buff *skb)
+{
+	struct sec_path *sp;
+
+	/* Allocate new secpath or COW existing one. */
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		sp = secpath_dup(skb->sp);
+		if (!sp)
+			return -ENOMEM;
+
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(secpath_set);
+
 /* Fetch spi and seq from ipsec header */
 
 int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
@@ -212,18 +230,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		break;
 	}
 
-	/* Allocate new secpath or COW existing one. */
-	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
-		struct sec_path *sp;
-
-		sp = secpath_dup(skb->sp);
-		if (!sp) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
-			goto drop;
-		}
-		if (skb->sp)
-			secpath_put(skb->sp);
-		skb->sp = sp;
+	err = secpath_set(skb);
+	if (err) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+		goto drop;
 	}
 
 	seq = 0;
-- 
cgit v1.2.3


From 5f114163f2f5eb2edbb49c4d3e0b405c7a8a7e2a Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:39:39 +0100
Subject: net: Add a skb_gro_flush_final helper.

Add a skb_gro_flush_final helper to prepare for  consuming
skbs in call_gro_receive. We will extend this helper to not
touch the skb if the skb is consumed by a gro callback with
a followup patch. We need this to handle the upcomming IPsec
ESP callbacks as they reinject the skb to the napi_gro_receive
asynchronous. The handler is used in all gro_receive functions
that can call the ESP gro handlers.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 5 +++++
 net/ethernet/eth.c        | 2 +-
 net/ipv4/af_inet.c        | 2 +-
 net/ipv6/ip6_offload.c    | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 58afbd1cc659..f9da3acfee9a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2661,6 +2661,11 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+{
+	NAPI_GRO_CB(skb)->flush |= flush;
+}
+
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index efdaaab735fc..c666ff0dd88b 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -474,7 +474,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 685ba53df2d1..602d40f43687 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1423,7 +1423,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index fc7b4017ba24..0838e6d01d2e 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -253,7 +253,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
-- 
cgit v1.2.3


From 25393d3fc055b76587fcc91627aee8c345400c3a Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:39:44 +0100
Subject: net: Prepare gro for packet consuming gro callbacks

The upcomming IPsec ESP gro callbacks will consume the skb,
so prepare for that.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 9 +++++++++
 net/core/dev.c            | 7 +++++++
 net/xfrm/Kconfig          | 4 ++++
 3 files changed, 20 insertions(+)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f9da3acfee9a..9e5d1cd9d975 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -352,6 +352,7 @@ enum gro_result {
 	GRO_HELD,
 	GRO_NORMAL,
 	GRO_DROP,
+	GRO_CONSUMED,
 };
 typedef enum gro_result gro_result_t;
 
@@ -2661,10 +2662,18 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
+#ifdef CONFIG_XFRM_OFFLOAD
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+{
+	if (PTR_ERR(pp) != -EINPROGRESS)
+		NAPI_GRO_CB(skb)->flush |= flush;
+}
+#else
 static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
 {
 	NAPI_GRO_CB(skb)->flush |= flush;
 }
+#endif
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
diff --git a/net/core/dev.c b/net/core/dev.c
index 3e1a60102e64..64efbb9e4436 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4505,6 +4505,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (&ptype->list == head)
 		goto normal;
 
+	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+		ret = GRO_CONSUMED;
+		goto ok;
+	}
+
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
@@ -4609,6 +4614,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	case GRO_HELD:
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4680,6 +4686,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 		break;
 
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..a484451c72ec 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -5,6 +5,10 @@ config XFRM
        bool
        depends on NET
 
+config XFRM_OFFLOAD
+       bool
+       depends on XFRM
+
 config XFRM_ALGO
 	tristate
 	select XFRM
-- 
cgit v1.2.3


From 1e29537034e388c7e72eac43cfcda1d23131623b Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:39:49 +0100
Subject: xfrm: Export xfrm_parse_spi.

We need it in the ESP offload handlers, so export it.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    | 1 +
 net/xfrm/xfrm_input.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 287635df4eef..fe8db3d87e4f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1520,6 +1520,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
 		    int encap_type);
 int xfrm4_transport_finish(struct sk_buff *skb, int async);
 int xfrm4_rcv(struct sk_buff *skb);
+int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq);
 
 static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 {
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index d8f913bb6919..86f8a8de5252 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -170,6 +170,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
 	*seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
 	return 0;
 }
+EXPORT_SYMBOL(xfrm_parse_spi);
 
 int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 54ef207ac8f7a17d677082157a29f4df8499dc81 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:39:54 +0100
Subject: xfrm: Extend the sec_path for IPsec offloading

We need to keep per packet offloading informations across
the layers. So we extend the sec_path to carry these for
the input and output offload codepath.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    | 41 +++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_input.c |  2 ++
 2 files changed, 43 insertions(+)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index fe8db3d87e4f..10086a0986f8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -498,6 +498,7 @@ struct xfrm_tmpl {
 };
 
 #define XFRM_MAX_DEPTH		6
+#define XFRM_MAX_OFFLOAD_DEPTH	1
 
 struct xfrm_policy_walk_entry {
 	struct list_head	all;
@@ -973,10 +974,41 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 
+struct xfrm_offload {
+	/* Output sequence number for replay protection on offloading. */
+	struct {
+		__u32 low;
+		__u32 hi;
+	} seq;
+
+	__u32			flags;
+#define	SA_DELETE_REQ		1
+#define	CRYPTO_DONE		2
+#define	CRYPTO_NEXT_DONE	4
+#define	CRYPTO_FALLBACK		8
+#define	XFRM_GSO_SEGMENT	16
+#define	XFRM_GRO		32
+
+	__u32			status;
+#define CRYPTO_SUCCESS				1
+#define CRYPTO_GENERIC_ERROR			2
+#define CRYPTO_TRANSPORT_AH_AUTH_FAILED		4
+#define CRYPTO_TRANSPORT_ESP_AUTH_FAILED	8
+#define CRYPTO_TUNNEL_AH_AUTH_FAILED		16
+#define CRYPTO_TUNNEL_ESP_AUTH_FAILED		32
+#define CRYPTO_INVALID_PACKET_SYNTAX		64
+#define CRYPTO_INVALID_PROTOCOL			128
+
+	__u8			proto;
+};
+
 struct sec_path {
 	atomic_t		refcnt;
 	int			len;
+	int			olen;
+
 	struct xfrm_state	*xvec[XFRM_MAX_DEPTH];
+	struct xfrm_offload	ovec[XFRM_MAX_OFFLOAD_DEPTH];
 };
 
 static inline int secpath_exists(struct sk_buff *skb)
@@ -1776,6 +1808,15 @@ static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
 {
 	return skb->sp->xvec[skb->sp->len - 1];
 }
+static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
+{
+	struct sec_path *sp = skb->sp;
+
+	if (!sp || !sp->olen || sp->len != sp->olen)
+		return NULL;
+
+	return &sp->ovec[sp->olen - 1];
+}
 #endif
 
 static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m)
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 86f8a8de5252..d2ff71230864 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -105,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
 		return NULL;
 
 	sp->len = 0;
+	sp->olen = 0;
+
 	if (src) {
 		int i;
 
-- 
cgit v1.2.3


From 7785bba299a8dc8fe8390a0183dad3cafb3f1d80 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 15 Feb 2017 09:40:00 +0100
Subject: esp: Add a software GRO codepath

This patch adds GRO ifrastructure and callbacks for ESP on
ipv4 and ipv6.

In case the GRO layer detects an ESP packet, the
esp{4,6}_gro_receive() function does a xfrm state lookup
and calls the xfrm input layer if it finds a matching state.
The packet will be decapsulated and reinjected it into layer 2.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h              |   1 +
 net/ipv4/Kconfig                |  13 +++++
 net/ipv4/Makefile               |   1 +
 net/ipv4/esp4_offload.c         | 106 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/xfrm4_input.c          |   6 +++
 net/ipv4/xfrm4_mode_transport.c |   4 +-
 net/ipv6/Kconfig                |  13 +++++
 net/ipv6/Makefile               |   1 +
 net/ipv6/esp6_offload.c         | 108 ++++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_input.c          |   7 +++
 net/ipv6/xfrm6_mode_transport.c |   4 +-
 net/xfrm/xfrm_input.c           |  31 ++++++++++--
 12 files changed, 288 insertions(+), 7 deletions(-)
 create mode 100644 net/ipv4/esp4_offload.c
 create mode 100644 net/ipv6/esp6_offload.c

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 10086a0986f8..14d82bf16692 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -682,6 +682,7 @@ struct xfrm_spi_skb_cb {
 
 	unsigned int daddroff;
 	unsigned int family;
+	__be32 seq;
 };
 
 #define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0]))
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6e7baaf814c6..e0878c3f66a8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -360,6 +360,19 @@ config INET_ESP
 
 	  If unsure, say Y.
 
+config INET_ESP_OFFLOAD
+	tristate "IP: ESP transformation offload"
+	depends on INET_ESP
+	select XFRM_OFFLOAD
+	default n
+	---help---
+	  Support for ESP transformation offload. This makes sense
+	  only if this system really does IPsec and want to do it
+	  with high throughput. A typical desktop system does not
+	  need it, even if it does IPsec.
+
+	  If unsure, say N.
+
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
 	select INET_XFRM_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 48af58a5686e..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
 obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	struct xfrm_offload *xo;
+	struct xfrm_state *x;
+	__be32 seq;
+	__be32 spi;
+	int err;
+
+	skb_pull(skb, offset);
+
+	if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+		goto out;
+
+	err = secpath_set(skb);
+	if (err)
+		goto out;
+
+	if (skb->sp->len == XFRM_MAX_DEPTH)
+		goto out;
+
+	x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+			      (xfrm_address_t *)&ip_hdr(skb)->daddr,
+			      spi, IPPROTO_ESP, AF_INET);
+	if (!x)
+		goto out;
+
+	skb->sp->xvec[skb->sp->len++] = x;
+	skb->sp->olen++;
+
+	xo = xfrm_offload(skb);
+	if (!xo) {
+		xfrm_state_put(x);
+		goto out;
+	}
+	xo->flags |= XFRM_GRO;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+	/* We don't need to handle errors from xfrm_input, it does all
+	 * the error handling and frees the resources on error. */
+	xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	skb_push(skb, offset);
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+
+static const struct net_offload esp4_offload = {
+	.callbacks = {
+		.gro_receive = esp4_gro_receive,
+	},
+};
+
+static int __init esp4_offload_init(void)
+{
+	return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+	inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
 
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	struct iphdr *iph = ip_hdr(skb);
 
 	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
 	iph->tot_len = htons(skb->len);
 	ip_send_check(iph);
 
+	if (xo && (xo->flags & XFRM_GRO)) {
+		skb_mac_header_rebuild(skb);
+		return 0;
+	}
+
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ihl = skb->data - skb_transport_header(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 		skb->network_header = skb->transport_header;
 	}
 	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
-	skb_reset_transport_header(skb);
+	if (!xo || !(xo->flags & XFRM_GRO))
+		skb_reset_transport_header(skb);
 	return 0;
 }
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec1267e2bd1f..b2a85cca91f7 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,19 @@ config INET6_ESP
 
 	  If unsure, say Y.
 
+config INET6_ESP_OFFLOAD
+	tristate "IPv6: ESP transformation offload"
+	depends on INET6_ESP
+	select XFRM_OFFLOAD
+	default n
+	---help---
+	  Support for ESP transformation offload. This makes sense
+	  only if this system really does IPsec and want to do it
+	  with high throughput. A typical desktop system does not
+	  need it, even if it does IPsec.
+
+	  If unsure, say N.
+
 config INET6_IPCOMP
 	tristate "IPv6: IPComp transformation"
 	select INET6_XFRM_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a9e9fec387ce..217e9ff0e24b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -30,6 +30,7 @@ ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
 obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..d914eb93204a
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,108 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <linux/icmpv6.h>
+
+static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	struct xfrm_offload *xo;
+	struct xfrm_state *x;
+	__be32 seq;
+	__be32 spi;
+	int err;
+
+	skb_pull(skb, offset);
+
+	if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+		goto out;
+
+	err = secpath_set(skb);
+	if (err)
+		goto out;
+
+	if (skb->sp->len == XFRM_MAX_DEPTH)
+		goto out;
+
+	x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+			      (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+			      spi, IPPROTO_ESP, AF_INET6);
+	if (!x)
+		goto out;
+
+	skb->sp->xvec[skb->sp->len++] = x;
+	skb->sp->olen++;
+
+	xo = xfrm_offload(skb);
+	if (!xo) {
+		xfrm_state_put(x);
+		goto out;
+	}
+	xo->flags |= XFRM_GRO;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+	/* We don't need to handle errors from xfrm_input, it does all
+	 * the error handling and frees the resources on error. */
+	xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	skb_push(skb, offset);
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+
+static const struct net_offload esp6_offload = {
+	.callbacks = {
+		.gro_receive = esp6_gro_receive,
+	},
+};
+
+static int __init esp6_offload_init(void)
+{
+	return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+static void __exit esp6_offload_exit(void)
+{
+	inet6_del_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+module_init(esp6_offload_init);
+module_exit(esp6_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 662fb2c3e765..08a807b29298 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
 
 int xfrm6_transport_finish(struct sk_buff *skb, int async)
 {
+	struct xfrm_offload *xo = xfrm_offload(skb);
+
 	skb_network_header(skb)[IP6CB(skb)->nhoff] =
 		XFRM_MODE_SKB_CB(skb)->protocol;
 
@@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 	ipv6_hdr(skb)->payload_len = htons(skb->len);
 	__skb_push(skb, skb->data - skb_network_header(skb));
 
+	if (xo && (xo->flags & XFRM_GRO)) {
+		skb_mac_header_rebuild(skb);
+		return -1;
+	}
+
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		ip6_rcv_finish);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e344105b3fd..4439ee44c8b0 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ihl = skb->data - skb_transport_header(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
@@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 	}
 	ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
 					   sizeof(struct ipv6hdr));
-	skb_reset_transport_header(skb);
+	if (!xo || !(xo->flags & XFRM_GRO))
+		skb_reset_transport_header(skb);
 	return 0;
 }
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index d2ff71230864..46bdb4fbed0b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -207,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	unsigned int family;
 	int decaps = 0;
 	int async = 0;
+	struct xfrm_offload *xo;
+	bool xfrm_gro = false;
 
-	/* A negative encap_type indicates async resumption. */
 	if (encap_type < 0) {
-		async = 1;
 		x = xfrm_input_state(skb);
-		seq = XFRM_SKB_CB(skb)->seq.input.low;
 		family = x->outer_mode->afinfo->family;
-		goto resume;
+
+		/* An encap_type of -1 indicates async resumption. */
+		if (encap_type == -1) {
+			async = 1;
+			seq = XFRM_SKB_CB(skb)->seq.input.low;
+			goto resume;
+		}
+		/* encap_type < -1 indicates a GRO call. */
+		encap_type = 0;
+		seq = XFRM_SPI_SKB_CB(skb)->seq;
+		goto lock;
 	}
 
 	daddr = (xfrm_address_t *)(skb_network_header(skb) +
@@ -260,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		skb->sp->xvec[skb->sp->len++] = x;
 
+lock:
 		spin_lock(&x->lock);
 
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -381,7 +391,18 @@ resume:
 		gro_cells_receive(&gro_cells, skb);
 		return 0;
 	} else {
-		return x->inner_mode->afinfo->transport_finish(skb, async);
+		xo = xfrm_offload(skb);
+		if (xo)
+			xfrm_gro = xo->flags & XFRM_GRO;
+
+		err = x->inner_mode->afinfo->transport_finish(skb, async);
+		if (xfrm_gro) {
+			skb_dst_drop(skb);
+			gro_cells_receive(&gro_cells, skb);
+			return err;
+		}
+
+		return err;
 	}
 
 drop_unlock:
-- 
cgit v1.2.3


From 425df17ce3a26d98f76e2b6b0af2acf4aeb0b026 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Tue, 14 Feb 2017 21:16:28 -0800
Subject: openvswitch: Set internal device max mtu to ETH_MAX_MTU.

Commit 91572088e3fd ("net: use core MTU range checking in core net
infra") changed the openvswitch internal device to use the core net
infra for controlling the MTU range, but failed to actually set the
max_mtu as described in the commit message, which now defaults to
ETH_DATA_LEN.

This patch fixes this by setting max_mtu to ETH_MAX_MTU after
ether_setup() call.

Fixes: 91572088e3fd ("net: use core MTU range checking in core net infra")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport-internal_dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 09141a18ee2d..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -149,6 +149,8 @@ static void do_setup(struct net_device *netdev)
 {
 	ether_setup(netdev);
 
+	netdev->max_mtu = ETH_MAX_MTU;
+
 	netdev->netdev_ops = &internal_dev_netdev_ops;
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
-- 
cgit v1.2.3


From 9dcbc313cd4ada386487b3055d9b6ec169d51765 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia
Date: Thu, 29 Dec 2016 09:51:19 -0300
Subject: Bluetooth: Fix NULL pointer dereference in bt_sock_recvmsg

As per the comment in include/linux/net.h, the recvfrom handlers
should expect msg_name to be NULL. However, bt_sock_recvmsg()
is currently not checking it, which could lead to a NULL pointer
dereference.

The following NULL pointer dereference was produced while testing
L2CAP datagram reception. Note that the kernel is tainted due to
the r8723bs module being inserted. However, it seems the fix still
applies.

$ l2test -r -G
l2test[326]: Receiving ...
Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = ee008000
[00000000] *pgd=7f896835
Internal error: Oops: 817 [#1] PREEMPT SMP ARM
Modules linked in: r8723bs(O)
CPU: 0 PID: 326 Comm: l2test Tainted: G           O 4.8.0 #1
Hardware name: Allwinner sun7i (A20) Family
task: ef1c6880 task.stack: eea70000
PC is at __memzero+0x58/0x80
LR is at l2cap_skb_msg_name+0x1c/0x4c
pc : [<c02c47d8>]    lr : [<c0506278>]    psr: 00070013
sp : eea71e60  ip : 00000000  fp : 00034e1c
r10: 00000000  r9 : 00000000  r8 : eea71ed4
r7 : 000002a0  r6 : eea71ed8  r5 : 00000000  r4 : ee4a5d80
r3 : 00000000  r2 : 00000000  r1 : 0000000e  r0 : 00000000
Flags: nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM Segment none
Control: 10c5387d  Table: 7600806a  DAC: 00000051
Process l2test (pid: 326, stack limit = 0xeea70210)
Stack: (0xeea71e60 to 0xeea72000)
1e60: ee4a5d80 eeac2800 000002a0 c04d7114 173eefa0 00000000 c06ca68e 00000000
1e80: 00000001 eeac2800 eef23500 00000000 000002a0 eea71ed4 eea70000 c0504d50
1ea0: 00000000 00000000 eef23500 00000000 00000000 c044e8a0 eea71edc eea9f904
1ec0: bef89aa0 fffffff7 00000000 00035008 000002a0 00000000 00000000 00000000
1ee0: 00000000 00000000 eea71ed4 00000000 00000000 00000000 00004000 00000000
1f00: 0000011b c01078c4 eea70000 c044e5e4 00000000 00000000 642f0001 6c2f7665
1f20: 0000676f 00000000 00000000 00000000 00000000 00000000 00000000 00000000
1f40: 00000000 00000000 00000000 00000000 00000000 ffffffff 00000001 bef89ad8
1f60: 000000a8 c01078c4 eea70000 00000000 00034e1c c01e6c74 00000000 00000000
1f80: 00034e1c 000341f8 00000000 00000123 c01078c4 c044e90c 00000000 00000000
1fa0: 000002a0 c0107700 00034e1c 000341f8 00000003 00035008 000002a0 00000000
1fc0: 00034e1c 000341f8 00000000 00000123 00000000 00000000 00011ffc 00034e1c
1fe0: 00000000 bef89aa4 0001211c b6eebb60 60070010 00000003 00000000 00000000
[<c02c47d8>] (__memzero) from [<c0506278>] (l2cap_skb_msg_name+0x1c/0x4c)
[<c0506278>] (l2cap_skb_msg_name) from [<c04d7114>] (bt_sock_recvmsg+0x128/0x160)
[<c04d7114>] (bt_sock_recvmsg) from [<c0504d50>] (l2cap_sock_recvmsg+0x98/0x134)
[<c0504d50>] (l2cap_sock_recvmsg) from [<c044e8a0>] (SyS_recvfrom+0x94/0xec)
[<c044e8a0>] (SyS_recvfrom) from [<c044e90c>] (SyS_recv+0x14/0x1c)
[<c044e90c>] (SyS_recv) from [<c0107700>] (ret_fast_syscall+0x0/0x3c)
Code: e3110010 18a0500c e49de004 e3110008 (18a0000c)
---[ end trace 224e35e79fe06b42 ]---

Signed-off-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/af_bluetooth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1aff2da9bc74..cfb2faba46de 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -245,7 +245,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
-		if (bt_sk(sk)->skb_msg_name)
+		if (msg->msg_name && bt_sk(sk)->skb_msg_name)
 			bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
 						&msg->msg_namelen);
 	}
-- 
cgit v1.2.3


From 27d1c469a5d3a104fc5a9313dac84e726f8f3de9 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 28 Dec 2016 21:17:54 +0000
Subject: Bluetooth: fix spelling mistake: "advetising" -> "advertising"

trivial fix to spelling mistake in BT_ERR_RATELIMITED error message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e17aacbc5630..0b4dba08a14e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	case LE_ADV_SCAN_RSP:
 		break;
 	default:
-		BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x",
+		BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
 				   type);
 		return;
 	}
-- 
cgit v1.2.3


From 749e6720d2ee10d5221d5d7b8cee8ac5d1cd690e Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:10 +0200
Subject: net/sched: cls_flower: Properly handle classifier flags dumping

Dump the classifier flags only if non zero and make sure to check
the return status of the handler that puts them into the netlink msg.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 0826c8ec3a76..850d98294bf6 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1229,7 +1229,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
 		goto nla_put_failure;
 
-	nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+	if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+		goto nla_put_failure;
 
 	if (tcf_exts_dump(skb, &f->exts))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 7a335adad8b06778c0876aa5a5eb8954cd835bf5 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:11 +0200
Subject: net/sched: cls_matchall: Dump the classifier flags

The classifier flags are not dumped to user-space, do that.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f2141cb55562..35ef1c1be1f3 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -244,6 +244,9 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 	    nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
 		goto nla_put_failure;
 
+	if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &head->exts))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 55593960d0d88c6d80b7b3a615dbe09de85f2541 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:13 +0200
Subject: net/sched: cls_flower: Reflect HW offload status

Flower support for the "in hw" offloading flags.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 850d98294bf6..9d0c99d2e9fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -273,6 +273,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
 					    tc);
+	if (!err)
+		f->flags |= TCA_CLS_FLAGS_IN_HW;
 
 	if (tc_skip_sw(f->flags))
 		return err;
@@ -912,6 +914,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			goto errout;
 	}
 
+	if (!tc_in_hw(fnew->flags))
+		fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	if (fold) {
 		if (!tc_skip_sw(fold->flags))
 			rhashtable_remove_fast(&head->ht, &fold->ht_node,
-- 
cgit v1.2.3


From c7d2b2f5eebe6e76efc11cfd7a600c0748234f3a Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:14 +0200
Subject: net/sched: cls_matchall: Reflect HW offloading status

Matchall support for the "in hw" offloading flags.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 35ef1c1be1f3..224eb2c14346 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -56,6 +56,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_to_netdev offload;
 	struct tc_cls_matchall_offload mall_offload = {0};
+	int err;
 
 	offload.type = TC_SETUP_MATCHALL;
 	offload.cls_mall = &mall_offload;
@@ -63,8 +64,12 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	offload.cls_mall->exts = &head->exts;
 	offload.cls_mall->cookie = cookie;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
-					     &offload);
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+					    &offload);
+	if (!err)
+		head->flags |= TCA_CLS_FLAGS_IN_HW;
+
+	return err;
 }
 
 static void mall_destroy_hw_filter(struct tcf_proto *tp,
@@ -194,6 +199,9 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
+	if (!tc_in_hw(new->flags))
+		new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	*arg = (unsigned long) head;
 	rcu_assign_pointer(tp->root, new);
 	if (head)
-- 
cgit v1.2.3


From 24d3dc6d27eae19f422a5e216e25d3a16628d4ff Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:15 +0200
Subject: net/sched: cls_u32: Reflect HW offload status

U32 support for the "in hw" offloading flags.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index a6ec3e4b57ab..4dbe0c680fe6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -523,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
 					    tp->protocol, &offload);
+
+	if (!err)
+		n->flags |= TCA_CLS_FLAGS_IN_HW;
+
 	if (tc_skip_sw(flags))
 		return err;
 
@@ -895,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
+		if (!tc_in_hw(new->flags))
+			new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
@@ -1014,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (err)
 			goto errhw;
 
+		if (!tc_in_hw(n->flags))
+			n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 		ins = &ht->ht[TC_U32_HASH(handle)];
 		for (pins = rtnl_dereference(*ins); pins;
 		     ins = &pins->next, pins = rtnl_dereference(*ins))
-- 
cgit v1.2.3


From 5cecb6cc008148b4afc51f7bacfa753e1a957483 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Thu, 16 Feb 2017 10:31:16 +0200
Subject: net/sched: cls_bpf: Reflect HW offload status

BPF classifier support for the "in hw" offloading flags.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index d9c97018317d..80f688436dd7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -148,6 +148,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_bpf_offload bpf_offload = {};
 	struct tc_to_netdev offload;
+	int err;
 
 	offload.type = TC_SETUP_CLSBPF;
 	offload.cls_bpf = &bpf_offload;
@@ -159,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	bpf_offload.exts_integrated = prog->exts_integrated;
 	bpf_offload.gen_flags = prog->gen_flags;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-					     tp->protocol, &offload);
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					    tp->protocol, &offload);
+
+	if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
+		prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
+
+	return err;
 }
 
 static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
@@ -511,6 +517,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		return ret;
 	}
 
+	if (!tc_in_hw(prog->gen_flags))
+		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	if (oldprog) {
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
-- 
cgit v1.2.3


From 40f9f439706073b4b0a654b3b99e18296b7990b3 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Sat, 11 Feb 2017 19:26:46 +0800
Subject: tipc: Fix tipc_sk_reinit race conditions

There are two problems with the function tipc_sk_reinit.  Firstly
it's doing a manual walk over an rhashtable.  This is broken as
an rhashtable can be resized and if you manually walk over it
during a resize then you may miss entries.

Secondly it's missing memory barriers as previously the code used
spinlocks which provide the barriers implicitly.

This patch fixes both problems.

Fixes: 07f6c4bc048a ("tipc: convert tipc reference table to...")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/net.c    |  4 ++++
 net/tipc/socket.c | 30 +++++++++++++++++++-----------
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/tipc/net.c b/net/tipc/net.c
index 28bf4feeb81c..ab8a2d5d1e32 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr)
 	char addr_string[16];
 
 	tn->own_addr = addr;
+
+	/* Ensure that the new address is visible before we reinit. */
+	smp_mb();
+
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 103d1fd058c0..6b09a778cc71 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -430,8 +430,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	INIT_LIST_HEAD(&tsk->cong_links);
 	msg = &tsk->phdr;
 	tn = net_generic(sock_net(sk), tipc_net_id);
-	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
-		      NAMED_H_SIZE, 0);
 
 	/* Finish initializing socket data structures */
 	sock->ops = ops;
@@ -441,6 +439,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 		pr_warn("Socket create failed; port number exhausted\n");
 		return -EINVAL;
 	}
+
+	/* Ensure tsk is visible before we read own_addr. */
+	smp_mb();
+
+	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
+		      NAMED_H_SIZE, 0);
+
 	msg_set_origport(msg, tsk->portid);
 	setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
 	sk->sk_shutdown = 0;
@@ -2234,24 +2239,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 void tipc_sk_reinit(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	const struct bucket_table *tbl;
-	struct rhash_head *pos;
+	struct rhashtable_iter iter;
 	struct tipc_sock *tsk;
 	struct tipc_msg *msg;
-	int i;
 
-	rcu_read_lock();
-	tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
-	for (i = 0; i < tbl->size; i++) {
-		rht_for_each_entry_rcu(tsk, pos, tbl, i, node) {
+	rhashtable_walk_enter(&tn->sk_rht, &iter);
+
+	do {
+		tsk = ERR_PTR(rhashtable_walk_start(&iter));
+		if (tsk)
+			continue;
+
+		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
 			msg = &tsk->phdr;
 			msg_set_prevnode(msg, tn->own_addr);
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-	}
-	rcu_read_unlock();
+
+		rhashtable_walk_stop(&iter);
+	} while (tsk == ERR_PTR(-EAGAIN));
 }
 
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
-- 
cgit v1.2.3


From afcb50ba7f745eea32f91d7f63d6aa88f929f9c4 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Thu, 16 Feb 2017 11:29:21 -0800
Subject: bridge: vlan_tunnel: explicitly reset metadata attrs to NULL on
 failure

Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_vlan_tunnel.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index b2b79a070162..6d2c4eed2dc8 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -85,6 +85,8 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
 	return 0;
 out:
 	dst_release(&vlan->tinfo.tunnel_dst->dst);
+	vlan->tinfo.tunnel_dst = NULL;
+	vlan->tinfo.tunnel_id = 0;
 
 	return err;
 }
-- 
cgit v1.2.3


From c78f8bdfa11fcceb9723c61212e4bd8f76c87f9e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 16 Feb 2017 22:24:48 +0100
Subject: bpf: mark all registered map/prog types as __ro_after_init

All map types and prog types are registered to the BPF core through
bpf_register_map_type() and bpf_register_prog_type() during init and
remain unchanged thereafter. As by design we don't (and never will)
have any pluggable code that can register to that at any later point
in time, lets mark all the existing bpf_{map,prog}_type_list objects
in the tree as __ro_after_init, so they can be moved to read-only
section from then onwards.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c    | 10 +++++-----
 kernel/bpf/hashtab.c     |  8 ++++----
 kernel/bpf/lpm_trie.c    |  2 +-
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c |  6 +++---
 net/core/filter.c        | 18 +++++++++---------
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d55d95dcf49..6b6f41f0b211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map_type_list array_type __read_mostly = {
+static struct bpf_map_type_list array_type __ro_after_init = {
 	.ops = &array_ops,
 	.type = BPF_MAP_TYPE_ARRAY,
 };
@@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map_type_list percpu_array_type __read_mostly = {
+static struct bpf_map_type_list percpu_array_type __ro_after_init = {
 	.ops = &percpu_array_ops,
 	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
 };
@@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = {
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 };
 
-static struct bpf_map_type_list prog_array_type __read_mostly = {
+static struct bpf_map_type_list prog_array_type __ro_after_init = {
 	.ops = &prog_array_ops,
 	.type = BPF_MAP_TYPE_PROG_ARRAY,
 };
@@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = {
 	.map_release = perf_event_fd_array_release,
 };
 
-static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
 	.ops = &perf_event_array_ops,
 	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
@@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = {
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
 };
 
-static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
 	.ops = &cgroup_array_ops,
 	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a753bbe7df0a..3ea87fb19a94 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 };
 
-static struct bpf_map_type_list htab_type __read_mostly = {
+static struct bpf_map_type_list htab_type __ro_after_init = {
 	.ops = &htab_ops,
 	.type = BPF_MAP_TYPE_HASH,
 };
@@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 };
 
-static struct bpf_map_type_list htab_lru_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_type __ro_after_init = {
 	.ops = &htab_lru_ops,
 	.type = BPF_MAP_TYPE_LRU_HASH,
 };
@@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 };
 
-static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
 	.ops = &htab_percpu_ops,
 	.type = BPF_MAP_TYPE_PERCPU_HASH,
 };
@@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 };
 
-static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
 	.ops = &htab_lru_percpu_ops,
 	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
 };
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e0f6a0bd279b..8bfe0afaee10 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -508,7 +508,7 @@ static const struct bpf_map_ops trie_ops = {
 	.map_delete_elem = trie_delete_elem,
 };
 
-static struct bpf_map_type_list trie_type __read_mostly = {
+static struct bpf_map_type_list trie_type __ro_after_init = {
 	.ops = &trie_ops,
 	.type = BPF_MAP_TYPE_LPM_TRIE,
 };
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be8519148c25..22aa45cd0324 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = {
 	.map_delete_elem = stack_map_delete_elem,
 };
 
-static struct bpf_map_type_list stack_map_type __read_mostly = {
+static struct bpf_map_type_list stack_map_type __ro_after_init = {
 	.ops = &stack_map_ops,
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 424daa4586d1..cee9802cf3e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -506,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = {
 	.is_valid_access = kprobe_prog_is_valid_access,
 };
 
-static struct bpf_prog_type_list kprobe_tl = {
+static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
 	.ops	= &kprobe_prog_ops,
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
@@ -589,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = {
 	.is_valid_access = tp_prog_is_valid_access,
 };
 
-static struct bpf_prog_type_list tracepoint_tl = {
+static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
 	.ops	= &tracepoint_prog_ops,
 	.type	= BPF_PROG_TYPE_TRACEPOINT,
 };
@@ -648,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = {
 	.convert_ctx_access	= pe_prog_convert_ctx_access,
 };
 
-static struct bpf_prog_type_list perf_event_tl = {
+static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
 	.ops	= &perf_event_prog_ops,
 	.type	= BPF_PROG_TYPE_PERF_EVENT,
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b753cbb2536..e466e0040137 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3296,47 +3296,47 @@ static const struct bpf_verifier_ops cg_sock_ops = {
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
 
-static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
-static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
 	.ops	= &tc_cls_act_ops,
 	.type	= BPF_PROG_TYPE_SCHED_CLS,
 };
 
-static struct bpf_prog_type_list sched_act_type __read_mostly = {
+static struct bpf_prog_type_list sched_act_type __ro_after_init = {
 	.ops	= &tc_cls_act_ops,
 	.type	= BPF_PROG_TYPE_SCHED_ACT,
 };
 
-static struct bpf_prog_type_list xdp_type __read_mostly = {
+static struct bpf_prog_type_list xdp_type __ro_after_init = {
 	.ops	= &xdp_ops,
 	.type	= BPF_PROG_TYPE_XDP,
 };
 
-static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
 	.ops	= &cg_skb_ops,
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
-static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
 	.ops	= &lwt_inout_ops,
 	.type	= BPF_PROG_TYPE_LWT_IN,
 };
 
-static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
 	.ops	= &lwt_inout_ops,
 	.type	= BPF_PROG_TYPE_LWT_OUT,
 };
 
-static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
 	.ops	= &lwt_xmit_ops,
 	.type	= BPF_PROG_TYPE_LWT_XMIT,
 };
 
-static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
 	.ops	= &cg_sock_ops,
 	.type	= BPF_PROG_TYPE_CGROUP_SOCK
 };
-- 
cgit v1.2.3


From 74451e66d516c55e309e8d89a4a1e7596e46aacd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 16 Feb 2017 22:24:50 +0100
Subject: bpf: make jited programs visible in traces

Long standing issue with JITed programs is that stack traces from
function tracing check whether a given address is kernel code
through {__,}kernel_text_address(), which checks for code in core
kernel, modules and dynamically allocated ftrace trampolines. But
what is still missing is BPF JITed programs (interpreted programs
are not an issue as __bpf_prog_run() will be attributed to them),
thus when a stack trace is triggered, the code walking the stack
won't see any of the JITed ones. The same for address correlation
done from user space via reading /proc/kallsyms. This is read by
tools like perf, but the latter is also useful for permanent live
tracing with eBPF itself in combination with stack maps when other
eBPF types are part of the callchain. See offwaketime example on
dumping stack from a map.

This work tries to tackle that issue by making the addresses and
symbols known to the kernel. The lookup from *kernel_text_address()
is implemented through a latched RB tree that can be read under
RCU in fast-path that is also shared for symbol/size/offset lookup
for a specific given address in kallsyms. The slow-path iteration
through all symbols in the seq file done via RCU list, which holds
a tiny fraction of all exported ksyms, usually below 0.1 percent.
Function symbols are exported as bpf_prog_<tag>, in order to aide
debugging and attribution. This facility is currently enabled for
root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening
is active in any mode. The rationale behind this is that still a lot
of systems ship with world read permissions on kallsyms thus addresses
should not get suddenly exposed for them. If that situation gets
much better in future, we always have the option to change the
default on this. Likewise, unprivileged programs are not allowed
to add entries there either, but that is less of a concern as most
such programs types relevant in this context are for root-only anyway.
If enabled, call graphs and stack traces will then show a correct
attribution; one example is illustrated below, where the trace is
now visible in tooling such as perf script --kallsyms=/proc/kallsyms
and friends.

Before:

  7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux)
         f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so)

After:

  7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux)
  [...]
  7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux)
         f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so)

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt      |  12 ++
 arch/arm64/net/bpf_jit_comp.c     |  15 ---
 arch/powerpc/net/bpf_jit_comp64.c |   1 +
 arch/s390/net/bpf_jit_comp.c      |  18 ---
 arch/x86/net/bpf_jit_comp.c       |  15 ---
 include/linux/bpf.h               |   4 +
 include/linux/filter.h            | 112 ++++++++++++++++++-
 kernel/bpf/core.c                 | 223 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c              |   2 +
 kernel/extable.c                  |   9 +-
 kernel/kallsyms.c                 |  61 +++++++++--
 net/Kconfig                       |   3 +-
 net/core/sysctl_net_core.c        |   7 ++
 13 files changed, 419 insertions(+), 63 deletions(-)

(limited to 'net')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index b80fbd4e5575..2ebabc93014a 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -54,6 +54,18 @@ Values :
 	1 - enable JIT hardening for unprivileged users only
 	2 - enable JIT hardening for all users
 
+bpf_jit_kallsyms
+----------------
+
+When Berkeley Packet Filter Just in Time compiler is enabled, then compiled
+images are unknown addresses to the kernel, meaning they neither show up in
+traces nor in /proc/kallsyms. This enables export of these addresses, which
+can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature
+is disabled.
+Values :
+	0 - disable JIT kallsyms export (default value)
+	1 - enable JIT kallsyms export for privileged users only
+
 dev_weight
 --------------
 
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index c444408d5a8c..05d12104d270 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -910,18 +910,3 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
-
-void bpf_jit_free(struct bpf_prog *prog)
-{
-	unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!prog->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(prog);
-}
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index f9ebd02260da..c34166ef76fc 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1064,6 +1064,7 @@ out:
 	return fp;
 }
 
+/* Overriding bpf_jit_free() as we don't set images read-only. */
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 6454efd22e63..f1d0e62ec1dd 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1339,21 +1339,3 @@ out:
 					   tmp : orig_fp);
 	return fp;
 }
-
-/*
- * Free eBPF program
- */
-void bpf_jit_free(struct bpf_prog *fp)
-{
-	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!fp->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(fp);
-}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26123d0ae13a..18a62e208826 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1180,18 +1180,3 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
-
-void bpf_jit_free(struct bpf_prog *fp)
-{
-	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!fp->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(fp);
-}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 57d60dc5b600..909fc033173a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -8,10 +8,12 @@
 #define _LINUX_BPF_H 1
 
 #include <uapi/linux/bpf.h>
+
 #include <linux/workqueue.h>
 #include <linux/file.h>
 #include <linux/percpu.h>
 #include <linux/err.h>
+#include <linux/rbtree_latch.h>
 
 struct perf_event;
 struct bpf_map;
@@ -177,6 +179,8 @@ struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
+	struct latch_tree_node ksym_tnode;
+	struct list_head ksym_lnode;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c7a70e0cc3a0..0c1cc9143cb2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -54,6 +54,12 @@ struct bpf_prog_aux;
 #define BPF_REG_AX		MAX_BPF_REG
 #define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
 
+/* As per nm, we expose JITed images as text (code) section for
+ * kallsyms. That way, tools like perf can find it to match
+ * addresses.
+ */
+#define BPF_SYM_ELF_TYPE	't'
+
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
@@ -555,6 +561,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 	set_memory_rw((unsigned long)fp, fp->pages);
 }
+
+static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
+{
+	set_memory_rw((unsigned long)hdr, hdr->pages);
+}
 #else
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
@@ -563,8 +574,21 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
+{
+}
 #endif /* CONFIG_DEBUG_SET_MODULE_RONX */
 
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->bpf_func;
+	unsigned long addr = real_start & PAGE_MASK;
+
+	return (void *)addr;
+}
+
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
@@ -617,6 +641,7 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
+extern int bpf_jit_kallsyms;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
@@ -651,6 +676,11 @@ static inline bool bpf_jit_is_ebpf(void)
 # endif
 }
 
+static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
+{
+	return fp->jited && bpf_jit_is_ebpf();
+}
+
 static inline bool bpf_jit_blinding_enabled(void)
 {
 	/* These are the prerequisites, should someone ever have the
@@ -668,11 +698,91 @@ static inline bool bpf_jit_blinding_enabled(void)
 
 	return true;
 }
-#else
+
+static inline bool bpf_jit_kallsyms_enabled(void)
+{
+	/* There are a couple of corner cases where kallsyms should
+	 * not be enabled f.e. on hardening.
+	 */
+	if (bpf_jit_harden)
+		return false;
+	if (!bpf_jit_kallsyms)
+		return false;
+	if (bpf_jit_kallsyms == 1)
+		return true;
+
+	return false;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+				 unsigned long *off, char *sym);
+bool is_bpf_text_address(unsigned long addr);
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		    char *sym);
+
+static inline const char *
+bpf_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	const char *ret = __bpf_address_lookup(addr, size, off, sym);
+
+	if (ret && modname)
+		*modname = NULL;
+	return ret;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp);
+void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+
+#else /* CONFIG_BPF_JIT */
+
+static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
+{
+	return false;
+}
+
 static inline void bpf_jit_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_free(fp);
 }
+
+static inline bool bpf_jit_kallsyms_enabled(void)
+{
+	return false;
+}
+
+static inline const char *
+__bpf_address_lookup(unsigned long addr, unsigned long *size,
+		     unsigned long *off, char *sym)
+{
+	return NULL;
+}
+
+static inline bool is_bpf_text_address(unsigned long addr)
+{
+	return false;
+}
+
+static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
+				  char *type, char *sym)
+{
+	return -ERANGE;
+}
+
+static inline const char *
+bpf_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	return NULL;
+}
+
+static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+}
+
+static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+}
 #endif /* CONFIG_BPF_JIT */
 
 #define BPF_ANC		BIT(15)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2831ba1e71c1..f45827e205d3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -28,6 +28,9 @@
 #include <linux/moduleloader.h>
 #include <linux/bpf.h>
 #include <linux/frame.h>
+#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
 
 #include <asm/unaligned.h>
 
@@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->aux = aux;
 	fp->aux->prog = fp;
 
+	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
@@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+static __always_inline void
+bpf_get_prog_addr_region(const struct bpf_prog *prog,
+			 unsigned long *symbol_start,
+			 unsigned long *symbol_end)
+{
+	const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
+	unsigned long addr = (unsigned long)hdr;
+
+	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
+
+	*symbol_start = addr;
+	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
+}
+
+static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	BUILD_BUG_ON(sizeof("bpf_prog_") +
+		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+
+	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
+	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
+	*sym = 0;
+}
+
+static __always_inline unsigned long
+bpf_get_prog_addr_start(struct latch_tree_node *n)
+{
+	unsigned long symbol_start, symbol_end;
+	const struct bpf_prog_aux *aux;
+
+	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+	return symbol_start;
+}
+
+static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
+					  struct latch_tree_node *b)
+{
+	return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
+}
+
+static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long symbol_start, symbol_end;
+	const struct bpf_prog_aux *aux;
+
+	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+	if (val < symbol_start)
+		return -1;
+	if (val >= symbol_end)
+		return  1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops bpf_tree_ops = {
+	.less	= bpf_tree_less,
+	.comp	= bpf_tree_comp,
+};
+
+static DEFINE_SPINLOCK(bpf_lock);
+static LIST_HEAD(bpf_kallsyms);
+static struct latch_tree_root bpf_tree __cacheline_aligned;
+
+int bpf_jit_kallsyms __read_mostly;
+
+static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
+{
+	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
+	list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
+	latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+}
+
+static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
+{
+	if (list_empty(&aux->ksym_lnode))
+		return;
+
+	latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+	list_del_rcu(&aux->ksym_lnode);
+}
+
+static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
+{
+	return fp->jited && !bpf_prog_was_classic(fp);
+}
+
+static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+{
+	return list_empty(&fp->aux->ksym_lnode) ||
+	       fp->aux->ksym_lnode.prev == LIST_POISON2;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+	unsigned long flags;
+
+	if (!bpf_prog_kallsyms_candidate(fp) ||
+	    !capable(CAP_SYS_ADMIN))
+		return;
+
+	spin_lock_irqsave(&bpf_lock, flags);
+	bpf_prog_ksym_node_add(fp->aux);
+	spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+	unsigned long flags;
+
+	if (!bpf_prog_kallsyms_candidate(fp))
+		return;
+
+	spin_lock_irqsave(&bpf_lock, flags);
+	bpf_prog_ksym_node_del(fp->aux);
+	spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
+{
+	struct latch_tree_node *n;
+
+	if (!bpf_jit_kallsyms_enabled())
+		return NULL;
+
+	n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
+	return n ?
+	       container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
+	       NULL;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+				 unsigned long *off, char *sym)
+{
+	unsigned long symbol_start, symbol_end;
+	struct bpf_prog *prog;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	prog = bpf_prog_kallsyms_find(addr);
+	if (prog) {
+		bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
+		bpf_get_prog_name(prog, sym);
+
+		ret = sym;
+		if (size)
+			*size = symbol_end - symbol_start;
+		if (off)
+			*off  = addr - symbol_start;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+bool is_bpf_text_address(unsigned long addr)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = bpf_prog_kallsyms_find(addr) != NULL;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		    char *sym)
+{
+	unsigned long symbol_start, symbol_end;
+	struct bpf_prog_aux *aux;
+	unsigned int it = 0;
+	int ret = -ERANGE;
+
+	if (!bpf_jit_kallsyms_enabled())
+		return ret;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
+		if (it++ != symnum)
+			continue;
+
+		bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+		bpf_get_prog_name(aux->prog, sym);
+
+		*value = symbol_start;
+		*type  = BPF_SYM_ELF_TYPE;
+
+		ret = 0;
+		break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
@@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	module_memfree(hdr);
 }
 
+/* This symbol is only overridden by archs that have different
+ * requirements than the usual eBPF JITs, f.e. when they only
+ * implement cBPF JIT, do not set images read-only, etc.
+ */
+void __weak bpf_jit_free(struct bpf_prog *fp)
+{
+	if (fp->jited) {
+		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+		bpf_jit_binary_unlock_ro(hdr);
+		bpf_jit_binary_free(hdr);
+
+		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
+	}
+
+	bpf_prog_unlock_free(fp);
+}
+
 int bpf_jit_harden __read_mostly;
 
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f74ca17af64a..461eb1e66a0f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -707,6 +707,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
@@ -903,6 +904,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_used_maps;
 
+	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..bd82117ad424 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/filter.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_ftrace_trampoline(addr))
 		return 1;
+	if (is_bpf_text_address(addr))
+		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
 	 * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_module_text_address(addr))
 		return 1;
-	return is_ftrace_trampoline(addr);
+	if (is_ftrace_trampoline(addr))
+		return 1;
+	if (is_bpf_text_address(addr))
+		return 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fafd1a3ef0da..6a3b249a2ae1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
+#include <linux/filter.h>
 #include <linux/compiler.h>
 
 #include <asm/sections.h>
@@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 				unsigned long *offset)
 {
 	char namebuf[KSYM_NAME_LEN];
+
 	if (is_ksym_addr(addr))
 		return !!get_symbol_pos(addr, symbolsize, offset);
-
-	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+	       !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
 }
 
 /*
@@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *offset,
 			    char **modname, char *namebuf)
 {
+	const char *ret;
+
 	namebuf[KSYM_NAME_LEN - 1] = 0;
 	namebuf[0] = 0;
 
@@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr,
 		return namebuf;
 	}
 
-	/* See if it's in a module. */
-	return module_address_lookup(addr, symbolsize, offset, modname,
-				     namebuf);
+	/* See if it's in a module or a BPF JITed image. */
+	ret = module_address_lookup(addr, symbolsize, offset,
+				    modname, namebuf);
+	if (!ret)
+		ret = bpf_address_lookup(addr, symbolsize,
+					 offset, modname, namebuf);
+	return ret;
 }
 
 int lookup_symbol_name(unsigned long addr, char *symname)
@@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
 struct kallsym_iter {
 	loff_t pos;
+	loff_t pos_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -481,13 +490,27 @@ struct kallsym_iter {
 
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
-	if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
-				&iter->type, iter->name, iter->module_name,
-				&iter->exported) < 0)
+	int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
+				     &iter->value, &iter->type,
+				     iter->name, iter->module_name,
+				     &iter->exported);
+	if (ret < 0) {
+		iter->pos_mod_end = iter->pos;
 		return 0;
+	}
+
 	return 1;
 }
 
+static int get_ksymbol_bpf(struct kallsym_iter *iter)
+{
+	iter->module_name[0] = '\0';
+	iter->exported = 0;
+	return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+			       &iter->value, &iter->type,
+			       iter->name) < 0 ? 0 : 1;
+}
+
 /* Returns space to next name. */
 static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
 {
@@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 	iter->name[0] = '\0';
 	iter->nameoff = get_symbol_offset(new_pos);
 	iter->pos = new_pos;
+	if (new_pos == 0)
+		iter->pos_mod_end = 0;
+}
+
+static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
+{
+	iter->pos = pos;
+
+	if (iter->pos_mod_end > 0 &&
+	    iter->pos_mod_end < iter->pos)
+		return get_ksymbol_bpf(iter);
+
+	if (!get_ksymbol_mod(iter))
+		return get_ksymbol_bpf(iter);
+
+	return 1;
 }
 
 /* Returns false if pos at or past end of file. */
 static int update_iter(struct kallsym_iter *iter, loff_t pos)
 {
 	/* Module symbols can be accessed randomly. */
-	if (pos >= kallsyms_num_syms) {
-		iter->pos = pos;
-		return get_ksymbol_mod(iter);
-	}
+	if (pos >= kallsyms_num_syms)
+		return update_iter_mod(iter, pos);
 
 	/* If we're not on the desired position, reset to new position. */
 	if (pos != iter->pos)
diff --git a/net/Kconfig b/net/Kconfig
index f19c0c3b9589..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -297,7 +297,8 @@ config BPF_JIT
 
 	  Note, admin should enable this feature changing:
 	  /proc/sys/net/core/bpf_jit_enable
-	  /proc/sys/net/core/bpf_jit_harden (optional)
+	  /proc/sys/net/core/bpf_jit_harden   (optional)
+	  /proc/sys/net/core/bpf_jit_kallsyms (optional)
 
 config NET_FLOW_LIMIT
 	bool
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index eaa72eb0399c..4ead336e14ea 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -334,6 +334,13 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "bpf_jit_kallsyms",
+		.data		= &bpf_jit_kallsyms,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
 # endif
 #endif
 	{
-- 
cgit v1.2.3


From eda7a5e88d95d8cc4315451dcfb74064773cbc02 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Thu, 16 Feb 2017 13:38:04 -0800
Subject: bridge: don't indicate expiry on NTF_EXT_LEARNED fdb entries

added_by_external_learn fdb entries are added and expired by
external entities like switchdev driver or external controllers.
ageing is already disabled for such entries. Hence, don't
indicate expiry for such fdb entries.

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 4ac11574117c..4f598dc2d916 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -65,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br)
 static inline int has_expired(const struct net_bridge *br,
 				  const struct net_bridge_fdb_entry *fdb)
 {
-	return !fdb->is_static &&
+	return !fdb->is_static && !fdb->added_by_external_learn &&
 		time_before_eq(fdb->updated + hold_time(br), jiffies);
 }
 
-- 
cgit v1.2.3


From 025331df34f6722f86b467cb13a69326444ab1bc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 17 Feb 2017 01:56:11 +0100
Subject: rtnl: don't account unused struct ifla_port_vsi in rtnl_port_size

When allocating rtnl dump messages, struct ifla_port_vsi is never dumped,
so we can save header plus payload in rtnl_port_size(). Infact, attribute
IFLA_PORT_VSI_TYPE and struct ifla_port_vsi are not used anywhere in
the kernel. We only need to keep the nla policy should applications in
user space be filling this out. Same NLA_BINARY issue exists as was fixed
in 364d5716a7ad ("rtnetlink: ifla_vf_policy: fix misuses of NLA_BINARY")
and others, but then again IFLA_PORT_VSI_TYPE is not used anywhere, so
just add a comment that it's unused.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index adfb54b896da..e3286d32eca5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -876,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
 {
 	size_t port_size = nla_total_size(4)		/* PORT_VF */
 		+ nla_total_size(PORT_PROFILE_MAX)	/* PORT_PROFILE */
-		+ nla_total_size(sizeof(struct ifla_port_vsi))
-							/* PORT_VSI_TYPE */
 		+ nla_total_size(PORT_UUID_MAX)		/* PORT_INSTANCE_UUID */
 		+ nla_total_size(PORT_UUID_MAX)		/* PORT_HOST_UUID */
 		+ nla_total_size(1)			/* PROT_VDP_REQUEST */
@@ -1491,14 +1489,19 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_VF]		= { .type = NLA_U32 },
 	[IFLA_PORT_PROFILE]	= { .type = NLA_STRING,
 				    .len = PORT_PROFILE_MAX },
-	[IFLA_PORT_VSI_TYPE]	= { .type = NLA_BINARY,
-				    .len = sizeof(struct ifla_port_vsi)},
 	[IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
 				      .len = PORT_UUID_MAX },
 	[IFLA_PORT_HOST_UUID]	= { .type = NLA_STRING,
 				    .len = PORT_UUID_MAX },
 	[IFLA_PORT_REQUEST]	= { .type = NLA_U8, },
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
+
+	/* Unused, but we need to keep it here since user space could
+	 * fill it. It's also broken with regard to NLA_BINARY use in
+	 * combination with structs.
+	 */
+	[IFLA_PORT_VSI_TYPE]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_port_vsi) },
 };
 
 static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
-- 
cgit v1.2.3


From 88c4845d7dec835ba7ad1379e30a09658b05495d Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 17 Feb 2017 18:16:21 +0000
Subject: rxrpc: Change module filename to rxrpc.ko

Change module filename from af-rxrpc.ko to rxrpc.ko so as to be consistent
with the other protocol drivers.

Also adjust the documentation to reflect this.

Further, there is no longer a standalone rxkad module, as it has been
merged into the rxrpc core, so get rid of references to that.

Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/filesystems/afs.txt | 34 +---------------------------------
 net/rxrpc/Makefile                | 12 ++++++------
 2 files changed, 7 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt
index ffef91c4e0d6..060da408923b 100644
--- a/Documentation/filesystems/afs.txt
+++ b/Documentation/filesystems/afs.txt
@@ -64,8 +64,7 @@ USAGE
 When inserting the driver modules the root cell must be specified along with a
 list of volume location server IP addresses:
 
-	modprobe af_rxrpc
-	modprobe rxkad
+	modprobe rxrpc
 	modprobe kafs rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91
 
 The first module is the AF_RXRPC network protocol driver.  This provides the
@@ -214,34 +213,3 @@ If a file is opened with a particular key and then the file descriptor is
 passed to a process that doesn't have that key (perhaps over an AF_UNIX
 socket), then the operations on the file will be made with key that was used to
 open the file.
-
-
-========
-EXAMPLES
-========
-
-Here's what I use to test this.  Some of the names and IP addresses are local
-to my internal DNS.  My "root.afs" partition has a mount point within it for
-some public volumes volumes.
-
-insmod /tmp/rxrpc.o
-insmod /tmp/rxkad.o
-insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91
-
-mount -t afs \%root.afs. /afs
-mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/
-
-echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 > /proc/fs/afs/cells
-mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/
-mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive
-mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib
-mount -t afs "#grand.central.org:root.doc." /afs/grand.central.org/doc
-mount -t afs "#grand.central.org:root.project." /afs/grand.central.org/project
-mount -t afs "#grand.central.org:root.service." /afs/grand.central.org/service
-mount -t afs "#grand.central.org:root.software." /afs/grand.central.org/software
-mount -t afs "#grand.central.org:root.user." /afs/grand.central.org/user
-
-umount /afs
-rmmod kafs
-rmmod rxkad
-rmmod rxrpc
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 8fc6ea347182..b9da4d6b914f 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -2,7 +2,9 @@
 # Makefile for Linux kernel RxRPC
 #
 
-af-rxrpc-y := \
+obj-$(CONFIG_AF_RXRPC) += rxrpc.o
+
+rxrpc-y := \
 	af_rxrpc.o \
 	call_accept.o \
 	call_event.o \
@@ -26,8 +28,6 @@ af-rxrpc-y := \
 	skbuff.o \
 	utils.o
 
-af-rxrpc-$(CONFIG_PROC_FS) += proc.o
-af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
-af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-
-obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
+rxrpc-$(CONFIG_PROC_FS) += proc.o
+rxrpc-$(CONFIG_RXKAD) += rxkad.o
+rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-- 
cgit v1.2.3


From 806a837650501f5eee359a197824db8752fa6e6c Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Fri, 17 Feb 2017 14:34:19 +0800
Subject: pkt_sched: Remove useless qdisc_stab_lock

The qdisc_stab_lock is used in qdisc_get_stab and qdisc_put_stab.
These two functions are invoked in qdisc_create, qdisc_change, and
qdisc_destroy which run fully under RTNL.

So it already makes sure only one could access the qdisc_stab_list at
the same time. Then it is unnecessary to use qdisc_stab_lock now.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a13c15e8f087..bcf49cd22786 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
 EXPORT_SYMBOL(qdisc_put_rtab);
 
 static LIST_HEAD(qdisc_stab_list);
-static DEFINE_SPINLOCK(qdisc_stab_lock);
 
 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
@@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	if (tsize != s->tsize || (!tab && tsize > 0))
 		return ERR_PTR(-EINVAL);
 
-	spin_lock(&qdisc_stab_lock);
-
 	list_for_each_entry(stab, &qdisc_stab_list, list) {
 		if (memcmp(&stab->szopts, s, sizeof(*s)))
 			continue;
 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 			continue;
 		stab->refcnt++;
-		spin_unlock(&qdisc_stab_lock);
 		return stab;
 	}
 
-	spin_unlock(&qdisc_stab_lock);
-
 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	if (tsize > 0)
 		memcpy(stab->data, tab, tsize * sizeof(u16));
 
-	spin_lock(&qdisc_stab_lock);
 	list_add_tail(&stab->list, &qdisc_stab_list);
-	spin_unlock(&qdisc_stab_lock);
 
 	return stab;
 }
@@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
 	if (!tab)
 		return;
 
-	spin_lock(&qdisc_stab_lock);
-
 	if (--tab->refcnt == 0) {
 		list_del(&tab->list);
 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 	}
-
-	spin_unlock(&qdisc_stab_lock);
 }
 EXPORT_SYMBOL(qdisc_put_stab);
 
-- 
cgit v1.2.3


From d2c58294f5416467ae0c5d00675bce3cd19595dd Mon Sep 17 00:00:00 2001
From: Zhu Yanjun
Date: Fri, 17 Feb 2017 04:16:22 -0500
Subject: rds:Remove unnecessary ib_ring unalloc

In the function rds_ib_xmit_atomic, ib_ring is not allocated
successfully. As such, it is not necessary to unalloc it.

Cc: Joe Jin <joe.jin@oracle.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_send.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 5e72de10c484..6ab39dbcca01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -770,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 
 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
 	if (work_alloc != 1) {
-		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 		rds_ib_stats_inc(s_ib_tx_ring_full);
 		ret = -ENOMEM;
 		goto out;
-- 
cgit v1.2.3


From a4ecb15a2432880353d0de4a35204ab8b65b8751 Mon Sep 17 00:00:00 2001
From: Cui, Cheng
Date: Fri, 17 Feb 2017 17:04:55 +0000
Subject: tcp: accommodate sequence number to a peer's shrunk receive window
 caused by precision loss in window scaling

Prevent sending out a left-shifted sequence number from a Linux sender in
response to a peer's shrunk receive-window caused by losing least significant
bits in window-scaling.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Cheng Cui <Cheng.Cui@netapp.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61f272a99a49..22548b5f05cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -83,7 +83,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 		      tcp_skb_pcount(skb));
 }
 
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
  * If window has been shrunk, what should we make? It is not clear at all.
  * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
  * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -93,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+	    (tp->rx_opt.wscale_ok &&
+	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
 		return tp->snd_nxt;
 	else
 		return tcp_wnd_end(tp);
-- 
cgit v1.2.3


From 4e33e34625103593a71d2bae471ce49cef62ef06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 17 Feb 2017 09:11:42 -0800
Subject: tcp: use page_ref_inc() in tcp_sendmsg()

sk_page_frag_refill() allocates either a compound page or an order-0
page. We can use page_ref_inc() which is slightly faster than get_page()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d44a6989e76d..da385ae997a3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1288,7 +1288,7 @@ new_segment:
 			} else {
 				skb_fill_page_desc(skb, i, pfrag->page,
 						   pfrag->offset, copy);
-				get_page(pfrag->page);
+				page_ref_inc(pfrag->page);
 			}
 			pfrag->offset += copy;
 		}
-- 
cgit v1.2.3


From bd4b9f8b4af7be15fd162276ec9a2a1d49f10270 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:37 +0800
Subject: sctp: add support for generating stream reconf resp chunk

This patch is to define Re-configuration Response Parameter described
in rfc6525 section 4.4. As optional fields are only for SSN/TSN Reset
Request Parameter, it uses another function to make that.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     | 24 ++++++++++++++++
 include/net/sctp/sm.h    |  7 +++++
 net/sctp/sm_make_chunk.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b055788de0cf..7a4804c4a593 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -749,4 +749,28 @@ struct sctp_strreset_addstrm {
 	__u16 reserved;
 };
 
+enum {
+	SCTP_STRRESET_NOTHING_TO_DO	= 0x00,
+	SCTP_STRRESET_PERFORMED		= 0x01,
+	SCTP_STRRESET_DENIED		= 0x02,
+	SCTP_STRRESET_ERR_WRONG_SSN	= 0x03,
+	SCTP_STRRESET_ERR_IN_PROGRESS	= 0x04,
+	SCTP_STRRESET_ERR_BAD_SEQNO	= 0x05,
+	SCTP_STRRESET_IN_PROGRESS	= 0x06,
+};
+
+struct sctp_strreset_resp {
+	sctp_paramhdr_t param_hdr;
+	__u32 response_seq;
+	__u32 result;
+};
+
+struct sctp_strreset_resptsn {
+	sctp_paramhdr_t param_hdr;
+	__u32 response_seq;
+	__u32 result;
+	__u32 senders_next_tsn;
+	__u32 receivers_next_tsn;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 3675fde3a26e..8a85dd7fc1ec 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -270,6 +270,13 @@ struct sctp_chunk *sctp_make_strreset_tsnreq(
 struct sctp_chunk *sctp_make_strreset_addstrm(
 				const struct sctp_association *asoc,
 				__u16 out, __u16 in);
+struct sctp_chunk *sctp_make_strreset_resp(
+				const struct sctp_association *asoc,
+				__u32 result, __u32 sn);
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+				struct sctp_association *asoc,
+				__u32 result, __u32 sn,
+				__u32 sender_tsn, __u32 receiver_tsn);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 7f8dbf2c6cee..9680c580759b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3733,3 +3733,77 @@ struct sctp_chunk *sctp_make_strreset_addstrm(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.4 (RESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_resp(
+				const struct sctp_association *asoc,
+				__u32 result, __u32 sn)
+{
+	struct sctp_strreset_resp resp;
+	__u16 length = sizeof(resp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	resp.param_hdr.length = htons(length);
+	resp.response_seq = htonl(sn);
+	resp.result = htonl(result);
+
+	sctp_addto_chunk(retval, sizeof(resp), &resp);
+
+	return retval;
+}
+
+/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                   Sender's Next TSN (optional)                |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                  Receiver's Next TSN (optional)               |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+					struct sctp_association *asoc,
+					__u32 result, __u32 sn,
+					__u32 sender_tsn, __u32 receiver_tsn)
+{
+	struct sctp_strreset_resptsn tsnresp;
+	__u16 length = sizeof(tsnresp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	tsnresp.param_hdr.length = htons(length);
+
+	tsnresp.response_seq = htonl(sn);
+	tsnresp.result = htonl(result);
+	tsnresp.senders_next_tsn = htonl(sender_tsn);
+	tsnresp.receivers_next_tsn = htonl(receiver_tsn);
+
+	sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
+
+	return retval;
+}
-- 
cgit v1.2.3


From 35ea82d611da59f8bea44a37996b3b11bb1d3fd7 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:38 +0800
Subject: sctp: add support for generating stream ssn reset event notification

This patch is to add Stream Reset Event described in rfc6525
section 6.1.1.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpevent.h |  4 ++++
 include/uapi/linux/sctp.h   | 16 ++++++++++++++++
 net/sctp/ulpevent.c         | 29 +++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 2c098cd7e7e2..324b5965fc4d 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -128,6 +128,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_authkey(
 struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
 	const struct sctp_association *asoc, gfp_t gfp);
 
+struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
+	const struct sctp_association *asoc, __u16 flags,
+	__u16 stream_num, __u16 *stream_list, gfp_t gfp);
+
 void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
 				   struct msghdr *);
 void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index a91a9cccbae6..d3ae381fcf33 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -490,6 +490,18 @@ struct sctp_sender_dry_event {
 	sctp_assoc_t sender_dry_assoc_id;
 };
 
+#define SCTP_STREAM_RESET_INCOMING_SSN	0x0001
+#define SCTP_STREAM_RESET_OUTGOING_SSN	0x0002
+#define SCTP_STREAM_RESET_DENIED	0x0004
+#define SCTP_STREAM_RESET_FAILED	0x0008
+struct sctp_stream_reset_event {
+	__u16 strreset_type;
+	__u16 strreset_flags;
+	__u32 strreset_length;
+	sctp_assoc_t strreset_assoc_id;
+	__u16 strreset_stream_list[];
+};
+
 /*
  * Described in Section 7.3
  *   Ancillary Data and Notification Interest Options
@@ -505,6 +517,7 @@ struct sctp_event_subscribe {
 	__u8 sctp_adaptation_layer_event;
 	__u8 sctp_authentication_event;
 	__u8 sctp_sender_dry_event;
+	__u8 sctp_stream_reset_event;
 };
 
 /*
@@ -529,6 +542,7 @@ union sctp_notification {
 	struct sctp_pdapi_event sn_pdapi_event;
 	struct sctp_authkey_event sn_authkey_event;
 	struct sctp_sender_dry_event sn_sender_dry_event;
+	struct sctp_stream_reset_event sn_strreset_event;
 };
 
 /* Section 5.3.1
@@ -556,6 +570,8 @@ enum sctp_sn_type {
 #define SCTP_AUTHENTICATION_INDICATION	SCTP_AUTHENTICATION_EVENT
 	SCTP_SENDER_DRY_EVENT,
 #define SCTP_SENDER_DRY_EVENT		SCTP_SENDER_DRY_EVENT
+	SCTP_STREAM_RESET_EVENT,
+#define SCTP_STREAM_RESET_EVENT		SCTP_STREAM_RESET_EVENT
 };
 
 /* Notification error codes used to fill up the error fields in some
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index bea00058ce35..c8881bc542a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
 	return event;
 }
 
+struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
+	const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
+	__u16 *stream_list, gfp_t gfp)
+{
+	struct sctp_stream_reset_event *sreset;
+	struct sctp_ulpevent *event;
+	struct sk_buff *skb;
+	int length, i;
+
+	length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num;
+	event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp);
+	if (!event)
+		return NULL;
+
+	skb = sctp_event2skb(event);
+	sreset = (struct sctp_stream_reset_event *)skb_put(skb, length);
+
+	sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
+	sreset->strreset_flags = flags;
+	sreset->strreset_length = length;
+	sctp_ulpevent_set_owner(event, asoc);
+	sreset->strreset_assoc_id = sctp_assoc2id(asoc);
+
+	for (i = 0; i < stream_num; i++)
+		sreset->strreset_stream_list[i] = ntohs(stream_list[i]);
+
+	return event;
+}
+
 /* Return the notification type, assuming this is a notification
  * event.
  */
-- 
cgit v1.2.3


From 81054476453640159149cb6704e834893e16736b Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:39 +0800
Subject: sctp: implement receiver-side procedures for the Outgoing SSN Reset
 Request Parameter

This patch is to implement Receiver-Side Procedures for the Outgoing
SSN Reset Request Parameter described in rfc6525 section 5.2.2.

Note that some checks must be after request_seq check, as even those
checks fail, strreset_inseq still has to be increase by 1.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h |   6 +++
 net/sctp/stream.c     | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 8a85dd7fc1ec..c0f9bea4fa92 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -280,6 +280,12 @@ struct sctp_chunk *sctp_make_strreset_tsnresp(
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
+/* Prototypes for stream-processing functions.  */
+struct sctp_chunk *sctp_process_strreset_outreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp);
+
 /* Prototypes for statetable processing. */
 
 int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype,
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index eb02490245ba..531fcc4cd112 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -294,3 +294,116 @@ int sctp_send_add_streams(struct sctp_association *asoc,
 out:
 	return retval;
 }
+
+static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param(
+			struct sctp_association *asoc, __u32 resp_seq)
+{
+	struct sctp_chunk *chunk = asoc->strreset_chunk;
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+
+	if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk)
+		return NULL;
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		/* sctp_strreset_tsnreq is actually the basic structure
+		 * of all stream reconf params, so it's safe to use it
+		 * to access request_seq.
+		 */
+		struct sctp_strreset_tsnreq *req = param.v;
+
+		if (req->request_seq == resp_seq)
+			return param.v;
+	}
+
+	return NULL;
+}
+
+struct sctp_chunk *sctp_process_strreset_outreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp)
+{
+	struct sctp_strreset_outreq *outreq = param.v;
+	struct sctp_stream *stream = asoc->stream;
+	__u16 i, nums, flags = 0, *str_p = NULL;
+	__u32 result = SCTP_STRRESET_DENIED;
+	__u32 request_seq;
+
+	request_seq = ntohl(outreq->request_seq);
+
+	if (ntohl(outreq->send_reset_at_tsn) >
+	    sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
+		result = SCTP_STRRESET_IN_PROGRESS;
+		goto out;
+	}
+
+	if (request_seq > asoc->strreset_inseq) {
+		result = SCTP_STRRESET_ERR_BAD_SEQNO;
+		goto out;
+	} else if (request_seq == asoc->strreset_inseq) {
+		asoc->strreset_inseq++;
+	}
+
+	/* Check strreset_enable after inseq inc, as sender cannot tell
+	 * the peer doesn't enable strreset after receiving response with
+	 * result denied, as well as to keep consistent with bsd.
+	 */
+	if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+		goto out;
+
+	if (asoc->strreset_chunk) {
+		sctp_paramhdr_t *param_hdr;
+		struct sctp_transport *t;
+
+		param_hdr = sctp_chunk_lookup_strreset_param(
+					asoc, outreq->response_seq);
+		if (!param_hdr || param_hdr->type !=
+					SCTP_PARAM_RESET_IN_REQUEST) {
+			/* same process with outstanding isn't 0 */
+			result = SCTP_STRRESET_ERR_IN_PROGRESS;
+			goto out;
+		}
+
+		asoc->strreset_outstanding--;
+		asoc->strreset_outseq++;
+
+		if (!asoc->strreset_outstanding) {
+			t = asoc->strreset_chunk->transport;
+			if (del_timer(&t->reconf_timer))
+				sctp_transport_put(t);
+
+			sctp_chunk_put(asoc->strreset_chunk);
+			asoc->strreset_chunk = NULL;
+		}
+
+		flags = SCTP_STREAM_RESET_INCOMING_SSN;
+	}
+
+	nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
+	if (nums) {
+		str_p = outreq->list_of_streams;
+		for (i = 0; i < nums; i++) {
+			if (ntohs(str_p[i]) >= stream->incnt) {
+				result = SCTP_STRRESET_ERR_WRONG_SSN;
+				goto out;
+			}
+		}
+
+		for (i = 0; i < nums; i++)
+			stream->in[ntohs(str_p[i])].ssn = 0;
+	} else {
+		for (i = 0; i < stream->incnt; i++)
+			stream->in[i].ssn = 0;
+	}
+
+	result = SCTP_STRRESET_PERFORMED;
+
+	*evp = sctp_ulpevent_make_stream_reset_event(asoc,
+		flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p,
+		GFP_ATOMIC);
+
+out:
+	return sctp_make_strreset_resp(asoc, result, request_seq);
+}
-- 
cgit v1.2.3


From 16e1a91965b02fe24d24e8b8d7b2245d29ed6a70 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:40 +0800
Subject: sctp: implement receiver-side procedures for the Incoming SSN Reset
 Request Parameter

This patch is to implement Receiver-Side Procedures for the Incoming
SSN Reset Request Parameter described in rfc6525 section 5.2.3.

It's also to move str_list endian conversion out of sctp_make_strreset_req,
so that sctp_make_strreset_req can be used more conveniently to process
inreq.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h    |  4 +++
 net/sctp/sm_make_chunk.c |  8 +-----
 net/sctp/stream.c        | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index c0f9bea4fa92..f6a828d3d072 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -285,6 +285,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 				struct sctp_association *asoc,
 				union sctp_params param,
 				struct sctp_ulpevent **evp);
+struct sctp_chunk *sctp_process_strreset_inreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp);
 
 /* Prototypes for statetable processing. */
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9680c580759b..60d9fdcef440 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3617,7 +3617,7 @@ struct sctp_chunk *sctp_make_strreset_req(
 	__u16 stream_len = stream_num * 2;
 	struct sctp_strreset_inreq inreq;
 	struct sctp_chunk *retval;
-	__u16 outlen, inlen, i;
+	__u16 outlen, inlen;
 
 	outlen = (sizeof(outreq) + stream_len) * out;
 	inlen = (sizeof(inreq) + stream_len) * in;
@@ -3626,9 +3626,6 @@ struct sctp_chunk *sctp_make_strreset_req(
 	if (!retval)
 		return NULL;
 
-	for (i = 0; i < stream_num; i++)
-		stream_list[i] = htons(stream_list[i]);
-
 	if (outlen) {
 		outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
 		outreq.param_hdr.length = htons(outlen);
@@ -3653,9 +3650,6 @@ struct sctp_chunk *sctp_make_strreset_req(
 			sctp_addto_chunk(retval, stream_len, stream_list);
 	}
 
-	for (i = 0; i < stream_num; i++)
-		stream_list[i] = ntohs(stream_list[i]);
-
 	return retval;
 }
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 531fcc4cd112..1c6cc04fa3a4 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -135,7 +135,14 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 			if (str_list[i] >= stream->incnt)
 				goto out;
 
+	for (i = 0; i < str_nums; i++)
+		str_list[i] = htons(str_list[i]);
+
 	chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
+
+	for (i = 0; i < str_nums; i++)
+		str_list[i] = ntohs(str_list[i]);
+
 	if (!chunk) {
 		retval = -ENOMEM;
 		goto out;
@@ -407,3 +414,66 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 out:
 	return sctp_make_strreset_resp(asoc, result, request_seq);
 }
+
+struct sctp_chunk *sctp_process_strreset_inreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp)
+{
+	struct sctp_strreset_inreq *inreq = param.v;
+	struct sctp_stream *stream = asoc->stream;
+	__u32 result = SCTP_STRRESET_DENIED;
+	struct sctp_chunk *chunk = NULL;
+	__u16 i, nums, *str_p;
+	__u32 request_seq;
+
+	request_seq = ntohl(inreq->request_seq);
+	if (request_seq > asoc->strreset_inseq) {
+		result = SCTP_STRRESET_ERR_BAD_SEQNO;
+		goto out;
+	} else if (request_seq == asoc->strreset_inseq) {
+		asoc->strreset_inseq++;
+	}
+
+	if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+		goto out;
+
+	if (asoc->strreset_outstanding) {
+		result = SCTP_STRRESET_ERR_IN_PROGRESS;
+		goto out;
+	}
+
+	nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
+	str_p = inreq->list_of_streams;
+	for (i = 0; i < nums; i++) {
+		if (ntohs(str_p[i]) >= stream->outcnt) {
+			result = SCTP_STRRESET_ERR_WRONG_SSN;
+			goto out;
+		}
+	}
+
+	chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
+	if (!chunk)
+		goto out;
+
+	if (nums)
+		for (i = 0; i < nums; i++)
+			stream->out[ntohs(str_p[i])].state =
+					       SCTP_STREAM_CLOSED;
+	else
+		for (i = 0; i < stream->outcnt; i++)
+			stream->out[i].state = SCTP_STREAM_CLOSED;
+
+	asoc->strreset_chunk = chunk;
+	asoc->strreset_outstanding = 1;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	*evp = sctp_ulpevent_make_stream_reset_event(asoc,
+		SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);
+
+out:
+	if (!chunk)
+		chunk =  sctp_make_strreset_resp(asoc, result, request_seq);
+
+	return chunk;
+}
-- 
cgit v1.2.3


From ea62504373fa9ff3ced27e07e1eac041888ecc46 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:41 +0800
Subject: sctp: add a function to verify the sctp reconf chunk

This patch is to add a function sctp_verify_reconf to do some length
check and multi-params check for sctp stream reconf according to rfc6525
section 3.1.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h    |  3 +++
 net/sctp/sm_make_chunk.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index f6a828d3d072..ca9fbfb2084c 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -277,6 +277,9 @@ struct sctp_chunk *sctp_make_strreset_tsnresp(
 				struct sctp_association *asoc,
 				__u32 result, __u32 sn,
 				__u32 sender_tsn, __u32 receiver_tsn);
+bool sctp_verify_reconf(const struct sctp_association *asoc,
+			struct sctp_chunk *chunk,
+			struct sctp_paramhdr **errp);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 60d9fdcef440..969a30c7bb54 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3801,3 +3801,62 @@ struct sctp_chunk *sctp_make_strreset_tsnresp(
 
 	return retval;
 }
+
+bool sctp_verify_reconf(const struct sctp_association *asoc,
+			struct sctp_chunk *chunk,
+			struct sctp_paramhdr **errp)
+{
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+	__u16 last = 0, cnt = 0;
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		__u16 length = ntohs(param.p->length);
+
+		*errp = param.p;
+		if (cnt++ > 2)
+			return false;
+		switch (param.p->type) {
+		case SCTP_PARAM_RESET_OUT_REQUEST:
+			if (length < sizeof(struct sctp_strreset_outreq) ||
+			    (last && last != SCTP_PARAM_RESET_RESPONSE &&
+			     last != SCTP_PARAM_RESET_IN_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_IN_REQUEST:
+			if (length < sizeof(struct sctp_strreset_inreq) ||
+			    (last && last != SCTP_PARAM_RESET_OUT_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_RESPONSE:
+			if ((length != sizeof(struct sctp_strreset_resp) &&
+			     length != sizeof(struct sctp_strreset_resptsn)) ||
+			    (last && last != SCTP_PARAM_RESET_RESPONSE &&
+			     last != SCTP_PARAM_RESET_OUT_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_TSN_REQUEST:
+			if (length !=
+			    sizeof(struct sctp_strreset_tsnreq) || last)
+				return false;
+			break;
+		case SCTP_PARAM_RESET_ADD_IN_STREAMS:
+			if (length != sizeof(struct sctp_strreset_addstrm) ||
+			    (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
+			if (length != sizeof(struct sctp_strreset_addstrm) ||
+			    (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS))
+				return false;
+			break;
+		default:
+			return false;
+		}
+
+		last = param.p->type;
+	}
+
+	return true;
+}
-- 
cgit v1.2.3


From 2040d3d7a36265d4a26af4e19a6a8a6154ab4bcc Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:42 +0800
Subject: sctp: add reconf chunk process

This patch is to add a function to process the incoming reconf chunk,
in which it verifies the chunk, and traverses the param and process
it with the right function one by one.

sctp_sf_do_reconf would be the process function of reconf chunk event.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h   |  1 +
 net/sctp/sm_statefuns.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ca9fbfb2084c..b6f682ec184a 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -135,6 +135,7 @@ sctp_state_fn_t sctp_sf_do_8_5_1_E_sa;
 sctp_state_fn_t sctp_sf_cookie_echoed_err;
 sctp_state_fn_t sctp_sf_do_asconf;
 sctp_state_fn_t sctp_sf_do_asconf_ack;
+sctp_state_fn_t sctp_sf_do_reconf;
 sctp_state_fn_t sctp_sf_do_9_2_reshutack;
 sctp_state_fn_t sctp_sf_eat_fwd_tsn;
 sctp_state_fn_t sctp_sf_eat_fwd_tsn_fast;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d8798ddda726..e03bb1aab4d0 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3834,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
 	return SCTP_DISPOSITION_DISCARD;
 }
 
+/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. */
+sctp_disposition_t sctp_sf_do_reconf(struct net *net,
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type, void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_paramhdr *err_param = NULL;
+	struct sctp_chunk *chunk = arg;
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	}
+
+	/* Make sure that the RECONF chunk has a valid length.  */
+	if (!sctp_chunk_length_valid(chunk, sizeof(*hdr)))
+		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+						  commands);
+
+	if (!sctp_verify_reconf(asoc, chunk, &err_param))
+		return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+						  (void *)err_param, commands);
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		struct sctp_chunk *reply = NULL;
+		struct sctp_ulpevent *ev = NULL;
+
+		if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST)
+			reply = sctp_process_strreset_outreq(
+				(struct sctp_association *)asoc, param, &ev);
+		else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST)
+			reply = sctp_process_strreset_inreq(
+				(struct sctp_association *)asoc, param, &ev);
+		/* More handles for other types will be added here, by now it
+		 * just ignores other types.
+		 */
+
+		if (ev)
+			sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+					SCTP_ULPEVENT(ev));
+
+		if (reply)
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(reply));
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
  *
-- 
cgit v1.2.3


From d884aa635be6247a1bc5b0fa60d8a16b5f48e279 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 12:45:43 +0800
Subject: sctp: add reconf chunk event

This patch is to add reconf chunk event based on the sctp event
frame in rx path, it will call sctp_sf_do_reconf to process the
reconf chunk.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h |  3 +++
 net/sctp/sm_statetable.c     | 30 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'net')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 3567c971cf3b..b07a745ab69f 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -60,11 +60,14 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM };
 
 #define SCTP_NUM_PRSCTP_CHUNK_TYPES	1
 
+#define SCTP_NUM_RECONF_CHUNK_TYPES	1
+
 #define SCTP_NUM_AUTH_CHUNK_TYPES	1
 
 #define SCTP_NUM_CHUNK_TYPES		(SCTP_NUM_BASE_CHUNK_TYPES + \
 					 SCTP_NUM_ADDIP_CHUNK_TYPES +\
 					 SCTP_NUM_PRSCTP_CHUNK_TYPES +\
+					 SCTP_NUM_RECONF_CHUNK_TYPES +\
 					 SCTP_NUM_AUTH_CHUNK_TYPES)
 
 /* These are the different flavours of event.  */
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index b5438b4f6c1e..419b18ebb056 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN
 	TYPE_SCTP_FWD_TSN,
 }; /*state_fn_t prsctp_chunk_event_table[][] */
 
+#define TYPE_SCTP_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_RECONF */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_RECONF,
+}; /*state_fn_t reconf_chunk_event_table[][] */
+
 #define TYPE_SCTP_AUTH { \
 	/* SCTP_STATE_CLOSED */ \
 	TYPE_SCTP_FUNC(sctp_sf_ootb), \
@@ -964,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
 			return &addip_chunk_event_table[1][state];
 	}
 
+	if (net->sctp.reconf_enable)
+		if (cid == SCTP_CID_RECONF)
+			return &reconf_chunk_event_table[0][state];
+
 	if (net->sctp.auth_enable) {
 		if (cid == SCTP_CID_AUTH)
 			return &auth_chunk_event_table[0][state];
-- 
cgit v1.2.3


From cd2b708750582e327789d8fb07c6eb5f79f7759f Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Fri, 17 Feb 2017 16:35:24 +0800
Subject: sctp: check duplicate node before inserting a new transport

sctp has changed to use rhlist for transport rhashtable since commit
7fda702f9315 ("sctp: use new rhlist interface on sctp transport
rhashtable").

But rhltable_insert_key doesn't check the duplicate node when inserting
a node, unlike rhashtable_lookup_insert_key. It may cause duplicate
assoc/transport in rhashtable. like:

 client (addr A, B)                 server (addr X, Y)
    connect to X           INIT (1)
                        ------------>
    connect to Y           INIT (2)
                        ------------>
                         INIT_ACK (1)
                        <------------
                         INIT_ACK (2)
                        <------------

After sending INIT (2), one transport will be created and hashed into
rhashtable. But when receiving INIT_ACK (1) and processing the address
params, another transport will be created and hashed into rhashtable
with the same addr Y and EP as the last transport. This will confuse
the assoc/transport's lookup.

This patch is to fix it by returning err if any duplicate node exists
before inserting it.

Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport rhashtable")
Reported-by: Fabio M. Di Nitto <fdinitto@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 704ad19c1565..fc458968fe4b 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -872,6 +872,8 @@ void sctp_transport_hashtable_destroy(void)
 
 int sctp_hash_transport(struct sctp_transport *t)
 {
+	struct sctp_transport *transport;
+	struct rhlist_head *tmp, *list;
 	struct sctp_hash_cmp_arg arg;
 	int err;
 
@@ -882,8 +884,19 @@ int sctp_hash_transport(struct sctp_transport *t)
 	arg.paddr = &t->ipaddr;
 	arg.lport = htons(t->asoc->base.bind_addr.port);
 
+	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+			       sctp_hash_params);
+
+	rhl_for_each_entry_rcu(transport, tmp, list, node)
+		if (transport->asoc->ep == t->asoc->ep) {
+			err = -EEXIST;
+			goto out;
+		}
+
 	err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
 				  &t->node, sctp_hash_params);
+
+out:
 	if (err)
 		pr_err_once("insert transport fail, errno %d\n", err);
 
-- 
cgit v1.2.3


From 2317c6b51e4249dbfa093e1b88cab0a9f0564b7f Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Fri, 17 Feb 2017 18:11:58 -0800
Subject: openvswitch: Set event bit after initializing labels.

Connlabels are included in conntrack netlink event messages only if
the IPCT_LABEL bit is set in the event cache (see
ctnetlink_conntrack_event()).  Set it after initializing labels for a
new connection.

Found upon further system testing, where it was noticed that labels
were missing from the conntrack events.

Fixes: 193e30967897 ("openvswitch: Do not trigger events for unconfirmed connections.")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c2d452eab0c5..85cd59526670 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -339,9 +339,7 @@ static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
 
 /* Initialize labels for a new, yet to be committed conntrack entry.  Note that
  * since the new connection is not yet confirmed, and thus no-one else has
- * access to it's labels, we simply write them over.  Also, we refrain from
- * triggering events, as receiving change events before the create event would
- * be confusing.
+ * access to it's labels, we simply write them over.
  */
 static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
 			      const struct ovs_key_ct_labels *labels,
@@ -374,6 +372,11 @@ static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
 				 & mask->ct_labels_32[i]);
 	}
 
+	/* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+	 * IPCT_LABEL bit it set in the event cache.
+	 */
+	nf_conntrack_event_cache(IPCT_LABEL, ct);
+
 	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
 
 	return 0;
-- 
cgit v1.2.3


From b7018d0b63003c0474451991797113605f26ee81 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Sun, 19 Feb 2017 01:52:45 +0800
Subject: sctp: flush out queue once assoc state falls into SHUTDOWN_PENDING

This patch is to flush out queue when assoc state falls into
SHUTDOWN_PENDING if there are still chunks in it, so that the
data can be sent out as soon as possible before sending SHUTDOWN
chunk.

When sctp supports MSG_MORE flag in next patch, this improvement
can also solve the problem that the chunks with MSG_MORE flag
may be stuck in queue when closing an assoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_sideeffect.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 51abcc90fe75..25384fa82ba9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -872,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
 		if (!sctp_style(sk, UDP))
 			sk->sk_state_change(sk);
 	}
+
+	if (sctp_state(asoc, SHUTDOWN_PENDING) &&
+	    !sctp_outq_is_empty(&asoc->outqueue))
+		sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
 }
 
 /* Helper function to delete an association. */
-- 
cgit v1.2.3


From 4ea0c32f5f42f7ef33a7ecfb9b61ff0cad9b3c08 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Sun, 19 Feb 2017 01:52:46 +0800
Subject: sctp: add support for MSG_MORE

This patch is to add support for MSG_MORE on sctp.

It adds force_delay in sctp_datamsg to save MSG_MORE, and sets it after
creating datamsg according to the send flag. sctp_packet_can_append_data
then uses it to decide if the chunks of this msg will be sent at once or
delay it.

Note that unlike [1], this patch saves MSG_MORE in datamsg, instead of
in assoc. As sctp enqueues the chunks first, then dequeue them one by
one. If it's saved in assoc,the current msg's send flag (MSG_MORE) may
affect other chunks' bundling.

Since last patch, sctp flush out queue once assoc state falls into
SHUTDOWN_PENDING, the close block problem mentioned in [1] has been
solved as well.

[1] https://patchwork.ozlabs.org/patch/372404/

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 1 +
 net/sctp/output.c          | 9 +++------
 net/sctp/socket.c          | 1 +
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 387c802bf248..a244db5e5ff7 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -497,6 +497,7 @@ struct sctp_datamsg {
 	/* Did the messenge fail to send? */
 	int send_error;
 	u8 send_failed:1,
+	   force_delay:1,
 	   can_delay;	    /* should this message be Nagle delayed */
 };
 
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 814eac047467..85406d5f8f41 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -704,18 +704,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
 	 * unacknowledged.
 	 */
 
-	if (sctp_sk(asoc->base.sk)->nodelay)
-		/* Nagle disabled */
+	if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
+	    !chunk->msg->force_delay)
+		/* Nothing unacked */
 		return SCTP_XMIT_OK;
 
 	if (!sctp_packet_empty(packet))
 		/* Append to packet */
 		return SCTP_XMIT_OK;
 
-	if (inflight == 0)
-		/* Nothing unacked */
-		return SCTP_XMIT_OK;
-
 	if (!sctp_state(asoc, ESTABLISHED))
 		return SCTP_XMIT_OK;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 75f35cea4371..b5321486fbed 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1964,6 +1964,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		err = PTR_ERR(datamsg);
 		goto out_free;
 	}
+	datamsg->force_delay = !!(msg->msg_flags & MSG_MORE);
 
 	/* Now send the (possibly) fragmented message. */
 	list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
-- 
cgit v1.2.3


From 24045a03b8796e3e1ddb370dfe4bc592a9f5f301 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Mon, 20 Feb 2017 08:03:30 -0800
Subject: net: mpls: Add support for netconf

Add netconf support to MPLS. Allows userpsace to learn and be notified
of changes to 'input' enable setting per interface.

Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netconf.h   |   1 +
 include/uapi/linux/rtnetlink.h |   2 +
 net/mpls/af_mpls.c             | 212 ++++++++++++++++++++++++++++++++++++++++-
 net/mpls/internal.h            |   2 +-
 4 files changed, 214 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index 45dfad509c4d..7e5f0f3e31bf 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -16,6 +16,7 @@ enum {
 	NETCONFA_MC_FORWARDING,
 	NETCONFA_PROXY_NEIGH,
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+	NETCONFA_INPUT,
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 8c93ad1ef9ab..6546917d605a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -658,6 +658,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_MPLS_ROUTE	RTNLGRP_MPLS_ROUTE
 	RTNLGRP_NSID,
 #define RTNLGRP_NSID		RTNLGRP_NSID
+	RTNLGRP_MPLS_NETCONF,
+#define RTNLGRP_MPLS_NETCONF	RTNLGRP_MPLS_NETCONF
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 64d3bf269a26..3818686182b2 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -7,6 +7,7 @@
 #include <linux/if_arp.h>
 #include <linux/ipv6.h>
 #include <linux/mpls.h>
+#include <linux/netconf.h>
 #include <linux/vmalloc.h>
 #include <linux/percpu.h>
 #include <net/ip.h>
@@ -960,15 +961,215 @@ static size_t mpls_get_stats_af_size(const struct net_device *dev)
 	return nla_total_size_64bit(sizeof(struct mpls_link_stats));
 }
 
+static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev,
+				     u32 portid, u32 seq, int event,
+				     unsigned int flags, int type)
+{
+	struct nlmsghdr  *nlh;
+	struct netconfmsg *ncm;
+	bool all = false;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	ncm = nlmsg_data(nlh);
+	ncm->ncm_family = AF_MPLS;
+
+	if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0)
+		goto nla_put_failure;
+
+	if ((all || type == NETCONFA_INPUT) &&
+	    nla_put_s32(skb, NETCONFA_INPUT,
+			mdev->input_enabled) < 0)
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int mpls_netconf_msgsize_devconf(int type)
+{
+	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+			+ nla_total_size(4); /* NETCONFA_IFINDEX */
+	bool all = false;
+
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_INPUT)
+		size += nla_total_size(4);
+
+	return size;
+}
+
+static void mpls_netconf_notify_devconf(struct net *net, int type,
+					struct mpls_dev *mdev)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF,
+					0, type);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
+}
+
+static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
+	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },
+};
+
+static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
+				    struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[NETCONFA_MAX + 1];
+	struct netconfmsg *ncm;
+	struct net_device *dev;
+	struct mpls_dev *mdev;
+	struct sk_buff *skb;
+	int ifindex;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+			  devconf_mpls_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	if (!tb[NETCONFA_IFINDEX])
+		goto errout;
+
+	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		goto errout;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		goto errout;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = mpls_netconf_fill_devconf(skb, mdev,
+					NETLINK_CB(in_skb).portid,
+					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+					NETCONFA_ALL);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+	return err;
+}
+
+static int mpls_netconf_dump_devconf(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct hlist_head *head;
+	struct net_device *dev;
+	struct mpls_dev *mdev;
+	int idx, s_idx;
+	int h, s_h;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			mdev = mpls_dev_get(dev);
+			if (!mdev)
+				goto cont;
+			if (mpls_netconf_fill_devconf(skb, mdev,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      RTM_NEWNETCONF,
+						      NLM_F_MULTI,
+						      NETCONFA_ALL) < 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+
+	return skb->len;
+}
+
 #define MPLS_PERDEV_SYSCTL_OFFSET(field)	\
 	(&((struct mpls_dev *)0)->field)
 
+static int mpls_conf_proc(struct ctl_table *ctl, int write,
+			  void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	int oval = *(int *)ctl->data;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write) {
+		struct mpls_dev *mdev = ctl->extra1;
+		int i = (int *)ctl->data - (int *)mdev;
+		struct net *net = ctl->extra2;
+		int val = *(int *)ctl->data;
+
+		if (i == offsetof(struct mpls_dev, input_enabled) &&
+		    val != oval) {
+			mpls_netconf_notify_devconf(net,
+						    NETCONFA_INPUT,
+						    mdev);
+		}
+	}
+
+	return ret;
+}
+
 static const struct ctl_table mpls_dev_table[] = {
 	{
 		.procname	= "input",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= mpls_conf_proc,
 		.data		= MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
 	},
 	{ }
@@ -978,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
 				    struct mpls_dev *mdev)
 {
 	char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+	struct net *net = dev_net(dev);
 	struct ctl_table *table;
 	int i;
 
@@ -988,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
 	/* Table data contains only offsets relative to the base of
 	 * the mdev at this point, so make them absolute.
 	 */
-	for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+	for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) {
 		table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+		table[i].extra1 = mdev;
+		table[i].extra2 = net;
+	}
 
 	snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
 
@@ -1041,6 +1246,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 	if (err)
 		goto free;
 
+	mdev->dev = dev;
 	rcu_assign_pointer(dev->mpls_ptr, mdev);
 
 	return mdev;
@@ -1861,6 +2067,8 @@ static int __init mpls_init(void)
 	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
+	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
+		      mpls_netconf_dump_devconf, NULL);
 	err = 0;
 out:
 	return err;
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index d97243034605..76360d8b9579 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -16,7 +16,7 @@ struct mpls_pcpu_stats {
 
 struct mpls_dev {
 	int				input_enabled;
-
+	struct net_device		*dev;
 	struct mpls_pcpu_stats __percpu	*stats;
 
 	struct ctl_table_header		*sysctl;
-- 
cgit v1.2.3


From d1892e4ec96aad4b733dfab3ed989718f04f03e9 Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Mon, 20 Feb 2017 16:32:06 +0100
Subject: rtnl: simplify error return path in rtnl_create_link()

There is only one possible error path which reaches the err label, so
return ERR_PTR(-ENOMEM) directly if alloc_netdev_mqs() fails. This also
allows to omit the err variable.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e3286d32eca5..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2358,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
 	const char *ifname, unsigned char name_assign_type,
 	const struct rtnl_link_ops *ops, struct nlattr *tb[])
 {
-	int err;
 	struct net_device *dev;
 	unsigned int num_tx_queues = 1;
 	unsigned int num_rx_queues = 1;
@@ -2373,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
 	else if (ops->get_num_rx_queues)
 		num_rx_queues = ops->get_num_rx_queues();
 
-	err = -ENOMEM;
 	dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
 			       ops->setup, num_tx_queues, num_rx_queues);
 	if (!dev)
-		goto err;
+		return ERR_PTR(-ENOMEM);
 
 	dev_net_set(dev, net);
 	dev->rtnl_link_ops = ops;
@@ -2403,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
 		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
 
 	return dev;
-
-err:
-	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(rtnl_create_link);
 
-- 
cgit v1.2.3


From ca4ef4574f1ee5252e2cd365f8f5d5bafd048f32 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Tue, 21 Feb 2017 09:33:18 +0100
Subject: ip: fix IP_CHECKSUM handling

The skbs processed by ip_cmsg_recv() are not guaranteed to
be linear e.g. when sending UDP packets over loopback with
MSGMORE.
Using csum_partial() on [potentially] the whole skb len
is dangerous; instead be on the safe side and use skb_checksum().

Thanks to syzkaller team to detect the issue and provide the
reproducer.

v1 -> v2:
 - move the variable declaration in a tighter scope

Fixes: ad6f939ab193 ("ip: Add offset parameter to ip_cmsg_recv")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ce1386a67e24..ebd953bc5607 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -116,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
 	if (skb->ip_summed != CHECKSUM_COMPLETE)
 		return;
 
-	if (offset != 0)
-		csum = csum_sub(csum,
-				csum_partial(skb_transport_header(skb) + tlen,
-					     offset, 0));
+	if (offset != 0) {
+		int tend_off = skb_transport_offset(skb) + tlen;
+		csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
+	}
 
 	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
 }
-- 
cgit v1.2.3


From 8ccde4c562076f280c0321c549d822448b64e565 Mon Sep 17 00:00:00 2001
From: Gao Feng
Date: Tue, 21 Feb 2017 17:09:19 +0800
Subject: net: sock: Use USEC_PER_SEC macro instead of literal 1000000

The USEC_PER_SEC is used once in sock_set_timeout as the max value of
tv_usec. But there are other similar codes which use the literal
1000000 in this file.
It is minor cleanup to keep consitent.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index b74356535559..e7d74940e863 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -367,7 +367,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 		return 0;
 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
-		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 	return 0;
 }
 
@@ -1146,7 +1146,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 			v.tm.tv_usec = 0;
 		} else {
 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
-			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
 		}
 		break;
 
@@ -1157,7 +1157,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 			v.tm.tv_usec = 0;
 		} else {
 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
-			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
 		}
 		break;
 
-- 
cgit v1.2.3


From d4af4c0ad0a4d070ef756010a2a10d96ff53d73d Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Tue, 21 Feb 2017 14:04:59 +0100
Subject: net/hsr: use eth_hw_addr_random()

Use eth_hw_addr_random() to set a random MAC address in order to make sure
dev->addr_assign_type will be properly set to NET_ADDR_RANDOM.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/hsr/hsr_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index fc65b145f6e7..c73160fb11e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -395,7 +395,7 @@ static struct device_type hsr_type = {
 
 void hsr_dev_setup(struct net_device *dev)
 {
-	random_ether_addr(dev->dev_addr);
+	eth_hw_addr_random(dev);
 
 	ether_setup(dev);
 	dev->min_mtu = 0;
-- 
cgit v1.2.3


From 29869d66870a715177bfb505f66a7e0e8bcc89c3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 21 Feb 2017 06:21:47 -0800
Subject: tcp: Revert "tcp: tcp_probe: use spin_lock_bh()"

This reverts commit e70ac171658679ecf6bea4bbd9e9325cd6079d2b.

jtcp_rcv_established() is in fact called with hard irq being disabled.

Initial bug report from Ricardo Nabinger Sanchez [1] still needs
to be investigated, but does not look like a TCP bug.

[1] https://www.spinics.net/lists/netdev/msg420960.html

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Cc: Ricardo Nabinger Sanchez <rnsanchez@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 3d063eb37848..f6c50af24a64 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -117,7 +117,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	     (fwmark > 0 && skb->mark == fwmark)) &&
 	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
 
-		spin_lock_bh(&tcp_probe.lock);
+		spin_lock(&tcp_probe.lock);
 		/* If log fills, just silently drop */
 		if (tcp_probe_avail() > 1) {
 			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
@@ -157,7 +157,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
 		}
 		tcp_probe.lastcwnd = tp->snd_cwnd;
-		spin_unlock_bh(&tcp_probe.lock);
+		spin_unlock(&tcp_probe.lock);
 
 		wake_up(&tcp_probe.wait);
 	}
-- 
cgit v1.2.3


From 559c59b238ebb7d39b732b6b08a59693b972e75c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 21 Feb 2017 08:20:56 -0800
Subject: net: napi_watchdog() can use napi_schedule_irqoff()

hrtimer handlers run with masked hard IRQ, we can therefore
use napi_schedule_irqoff()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 05d19c6acf94..304f2deae5f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5089,7 +5089,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 
 	napi = container_of(timer, struct napi_struct, timer);
 	if (napi->gro_list)
-		napi_schedule(napi);
+		napi_schedule_irqoff(napi);
 
 	return HRTIMER_NORESTART;
 }
-- 
cgit v1.2.3


From e623a9e9dec29ae811d11f83d0074ba254aba374 Mon Sep 17 00:00:00 2001
From: Maxime Jayat
Date: Tue, 21 Feb 2017 18:35:51 +0100
Subject: net: socket: fix recvmmsg not returning error from sock_error

Commit 34b88a68f26a ("net: Fix use after free in the recvmmsg exit path"),
changed the exit path of recvmmsg to always return the datagrams
variable and modified the error paths to set the variable to the error
code returned by recvmsg if necessary.

However in the case sock_error returned an error, the error code was
then ignored, and recvmmsg returned 0.

Change the error path of recvmmsg to correctly return the error code
of sock_error.

The bug was triggered by using recvmmsg on a CAN interface which was
not up. Linux 4.6 and later return 0 in this case while earlier
releases returned -ENETDOWN.

Fixes: 34b88a68f26a ("net: Fix use after free in the recvmmsg exit path")
Signed-off-by: Maxime Jayat <maxime.jayat@mobile-devices.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index b7a63d5bc915..2c1e8677ff2d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2228,8 +2228,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		return err;
 
 	err = sock_error(sock->sk);
-	if (err)
+	if (err) {
+		datagrams = err;
 		goto out_put;
+	}
 
 	entry = mmsg;
 	compat_entry = (struct compat_mmsghdr __user *)mmsg;
-- 
cgit v1.2.3