From 5037a00992e5fcb3d8509964313565a3dab6697c Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Thu, 25 Jan 2018 17:13:37 +0200 Subject: nl80211: Introduce scan flags to emphasize requested scan behavior This commit defines new scan flags (LOW_SPAN, LOW_POWER, HIGH_LATENCY) to emphasize the requested scan behavior for the driver. These flags are optional and are mutually exclusive. The implementation of the respective functionality can be driver/hardware specific. These flags can be used to control the compromise between how long a scan takes, how much power it uses, and high accurate/complete the scan is in finding the BSSs. Signed-off-by: Sunil Dutt Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c587a61c32bf..6f60503fa617 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4945,6 +4945,9 @@ enum nl80211_feature_flags { * probe request tx deferral and suppression * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL * value in %NL80211_ATTR_USE_MFP. + * @NL80211_EXT_FEATURE_LOW_SPAN_SCAN: Driver supports low span scan. + * @NL80211_EXT_FEATURE_LOW_POWER_SCAN: Driver supports low power scan. + * @NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN: Driver supports high accuracy scan. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4972,6 +4975,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, NL80211_EXT_FEATURE_MFP_OPTIONAL, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN, + NL80211_EXT_FEATURE_LOW_POWER_SCAN, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5032,6 +5038,10 @@ enum nl80211_timeout_reason { * of NL80211_CMD_TRIGGER_SCAN and NL80211_CMD_START_SCHED_SCAN * requests. * + * NL80211_SCAN_FLAG_LOW_SPAN, NL80211_SCAN_FLAG_LOW_POWER, and + * NL80211_SCAN_FLAG_HIGH_ACCURACY flags are exclusive of each other, i.e., only + * one of them can be used in the request. + * * @NL80211_SCAN_FLAG_LOW_PRIORITY: scan request has low priority * @NL80211_SCAN_FLAG_FLUSH: flush cache before scanning * @NL80211_SCAN_FLAG_AP: force a scan even if the interface is configured @@ -5059,7 +5069,20 @@ enum nl80211_timeout_reason { * and suppression (if it has received a broadcast Probe Response frame, * Beacon frame or FILS Discovery frame from an AP that the STA considers * a suitable candidate for (re-)association - suitable in terms of - * SSID and/or RSSI + * SSID and/or RSSI. + * @NL80211_SCAN_FLAG_LOW_SPAN: Span corresponds to the total time taken to + * accomplish the scan. Thus, this flag intends the driver to perform the + * scan request with lesser span/duration. It is specific to the driver + * implementations on how this is accomplished. Scan accuracy may get + * impacted with this flag. + * @NL80211_SCAN_FLAG_LOW_POWER: This flag intends the scan attempts to consume + * optimal possible power. Drivers can resort to their specific means to + * optimize the power. Scan accuracy may get impacted with this flag. + * @NL80211_SCAN_FLAG_HIGH_ACCURACY: Accuracy here intends to the extent of scan + * results obtained. Thus HIGH_ACCURACY scan flag aims to get maximum + * possible scan results. This flag hints the driver to use the best + * possible scan configuration to improve the accuracy in scanning. + * Latency and power use may get impacted with this flag. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5070,6 +5093,9 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP = 1<<5, NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE = 1<<6, NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION = 1<<7, + NL80211_SCAN_FLAG_LOW_SPAN = 1<<8, + NL80211_SCAN_FLAG_LOW_POWER = 1<<9, + NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, }; /** -- cgit v1.2.3 From 40cbfa90218bc570a7959b436b9d48a18c361041 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Thu, 25 Jan 2018 17:13:38 +0200 Subject: cfg80211/nl80211: Optional authentication offload to userspace This interface allows the host driver to offload the authentication to user space. This is exclusively defined for host drivers that do not define separate commands for authentication and association, but rely on userspace SME (e.g., in wpa_supplicant for the ~WPA_DRIVER_FLAGS_SME case) for the authentication to happen. This can be used to implement SAE without full implementation in the kernel/firmware while still being able to use NL80211_CMD_CONNECT with driver-based BSS selection. Host driver sends NL80211_CMD_EXTERNAL_AUTH event to start/abort authentication to the port on which connect is triggered and status of authentication is further indicated by user space to host driver through the same command response interface. User space entities advertise this capability through the NL80211_ATTR_EXTERNAL_AUTH_SUPP flag in the NL80211_CMD_CONNECT request. Host drivers shall look at this capability to offload the authentication. Signed-off-by: Srinivas Dasari Signed-off-by: Jouni Malinen [add socket connection ownership check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 +++++++++++++++++++++++-- include/uapi/linux/nl80211.h | 47 ++++++++++++++++++++++ net/wireless/nl80211.c | 94 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 15 +++++++ net/wireless/trace.h | 23 +++++++++++ 5 files changed, 230 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 81174f9b8d14..68def3e5b013 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1905,11 +1905,16 @@ struct cfg80211_auth_request { * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) * @ASSOC_REQ_DISABLE_VHT: Disable VHT * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association + * @CONNECT_REQ_EXTERNAL_AUTH_SUPPORT: User space indicates external + * authentication capability. Drivers can offload authentication to + * userspace if this flag is set. Only applicable for cfg80211_connect() + * request (connect callback). */ enum cfg80211_assoc_req_flags { - ASSOC_REQ_DISABLE_HT = BIT(0), - ASSOC_REQ_DISABLE_VHT = BIT(1), - ASSOC_REQ_USE_RRM = BIT(2), + ASSOC_REQ_DISABLE_HT = BIT(0), + ASSOC_REQ_DISABLE_VHT = BIT(1), + ASSOC_REQ_USE_RRM = BIT(2), + CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), }; /** @@ -2600,6 +2605,33 @@ struct cfg80211_pmk_conf { const u8 *pmk_r0_name; }; +/** + * struct cfg80211_external_auth_params - Trigger External authentication. + * + * Commonly used across the external auth request and event interfaces. + * + * @action: action type / trigger for external authentication. Only significant + * for the authentication request event interface (driver to user space). + * @bssid: BSSID of the peer with which the authentication has + * to happen. Used by both the authentication request event and + * authentication response command interface. + * @ssid: SSID of the AP. Used by both the authentication request event and + * authentication response command interface. + * @key_mgmt_suite: AKM suite of the respective authentication. Used by the + * authentication request event interface. + * @status: status code, %WLAN_STATUS_SUCCESS for successful authentication, + * use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you + * the real status code for failures. Used only for the authentication + * response command interface (user space to driver). + */ +struct cfg80211_external_auth_params { + enum nl80211_external_auth_action action; + u8 bssid[ETH_ALEN] __aligned(2); + struct cfg80211_ssid ssid; + unsigned int key_mgmt_suite; + u16 status; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -2923,6 +2955,9 @@ struct cfg80211_pmk_conf { * (invoked with the wireless_dev mutex held) * @del_pmk: delete the previously configured PMK for the given authenticator. * (invoked with the wireless_dev mutex held) + * + * @external_auth: indicates result of offloaded authentication processing from + * user space */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3216,6 +3251,8 @@ struct cfg80211_ops { const struct cfg80211_pmk_conf *conf); int (*del_pmk)(struct wiphy *wiphy, struct net_device *dev, const u8 *aa); + int (*external_auth)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_external_auth_params *params); }; /* @@ -6202,6 +6239,17 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); +/** + * cfg80211_external_auth_request - userspace request for authentication + * @netdev: network device + * @params: External authentication parameters + * @gfp: allocation flags + * Returns: 0 on success, < 0 on error + */ +int cfg80211_external_auth_request(struct net_device *netdev, + struct cfg80211_external_auth_params *params, + gfp_t gfp); + /* Logging, debugging and troubleshooting/diagnostic helpers. */ /* wiphy_printk helpers, similar to dev_printk */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6f60503fa617..c2342456cf16 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -992,6 +992,27 @@ * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * + * @NL80211_CMD_EXTERNAL_AUTH: This interface is exclusively defined for host + * drivers that do not define separate commands for authentication and + * association, but rely on user space for the authentication to happen. + * This interface acts both as the event request (driver to user space) + * to trigger the authentication and command response (userspace to + * driver) to indicate the authentication status. + * + * User space uses the %NL80211_CMD_CONNECT command to the host driver to + * trigger a connection. The host driver selects a BSS and further uses + * this interface to offload only the authentication part to the user + * space. Authentication frames are passed between the driver and user + * space through the %NL80211_CMD_FRAME interface. Host driver proceeds + * further with the association after getting successful authentication + * status. User space indicates the authentication status through + * %NL80211_ATTR_STATUS_CODE attribute in %NL80211_CMD_EXTERNAL_AUTH + * command interface. + * + * Host driver reports this status on an authentication failure to the + * user space through the connect result as the user space would have + * initiated the connection through the connect request. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1198,6 +1219,8 @@ enum nl80211_commands { NL80211_CMD_RELOAD_REGDB, + NL80211_CMD_EXTERNAL_AUTH, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2153,6 +2176,16 @@ enum nl80211_commands { * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) * + * @NL80211_ATTR_EXTERNAL_AUTH_ACTION: Identify the requested external + * authentication operation (u32 attribute with an + * &enum nl80211_external_auth_action value). This is used with the + * &NL80211_CMD_EXTERNAL_AUTH request event. + * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user + * space supports external authentication. This attribute shall be used + * only with %NL80211_CMD_CONNECT request. The driver may offload + * authentication processing to user space if this capability is indicated + * in NL80211_CMD_CONNECT requests from the user space. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2579,6 +2612,9 @@ enum nl80211_attrs { NL80211_ATTR_PMKR0_NAME, NL80211_ATTR_PORT_AUTHORIZED, + NL80211_ATTR_EXTERNAL_AUTH_ACTION, + NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5495,4 +5531,15 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * nl80211_external_auth_action - Action to perform with external + * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. + * @NL80211_EXTERNAL_AUTH_START: Start the authentication. + * @NL80211_EXTERNAL_AUTH_ABORT: Abort the ongoing authentication. + */ +enum nl80211_external_auth_action { + NL80211_EXTERNAL_AUTH_START, + NL80211_EXTERNAL_AUTH_ABORT, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index dd249ec9f228..aa6b64069c80 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -420,6 +420,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, + [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9161,6 +9162,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) { + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, + "external auth requires connection ownership"); + return -EINVAL; + } + connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -12469,6 +12479,41 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info) return ret; } +static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_external_auth_params params; + + if (rdev->ops->external_auth) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_SSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_BSSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STATUS_CODE]) + return -EINVAL; + + memset(¶ms, 0, sizeof(params)); + + params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + if (params.ssid.ssid_len == 0 || + params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]), + params.ssid.ssid_len); + + memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), + ETH_ALEN); + + params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + + return rdev_external_auth(rdev, dev, ¶ms); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13364,6 +13409,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_EXTERNAL_AUTH, + .doit = nl80211_external_auth, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; @@ -15375,6 +15428,47 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) nlmsg_free(msg); } +int cfg80211_external_auth_request(struct net_device *dev, + struct cfg80211_external_auth_params *params, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + + if (!wdev->conn_owner_nlportid) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EXTERNAL_AUTH); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, params->key_mgmt_suite) || + nla_put_u32(msg, NL80211_ATTR_EXTERNAL_AUTH_ACTION, + params->action) || + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) || + nla_put(msg, NL80211_ATTR_SSID, params->ssid.ssid_len, + params->ssid.ssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->conn_owner_nlportid); + return 0; + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(cfg80211_external_auth_request); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 0c06240d25af..84f23ae015fc 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1190,4 +1190,19 @@ static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, trace_rdev_return_int(&rdev->wiphy, ret); return ret; } + +static inline int +rdev_external_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_external_auth_params *params) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_external_auth(&rdev->wiphy, dev, params); + if (rdev->ops->external_auth) + ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index bcfedd39e7a3..5152938b358d 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2319,6 +2319,29 @@ TRACE_EVENT(rdev_del_pmk, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa)) ); +TRACE_EVENT(rdev_external_auth, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_external_auth_params *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(bssid) + __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1) + __field(u16, status) + ), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(bssid, params->bssid); + memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); + memcpy(__entry->ssid, params->ssid.ssid, + params->ssid.ssid_len); + __entry->status = params->status; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT + ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->bssid, __entry->ssid, __entry->status) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From 466b9936bf93b7ec3bce1dcd493262ff0a8a4f44 Mon Sep 17 00:00:00 2001 From: tamizhr@codeaurora.org Date: Wed, 31 Jan 2018 16:24:49 +0530 Subject: cfg80211: Add support to notify station's opmode change to userspace ht/vht action frames will be sent to AP from station to notify change of its ht/vht opmode(max bandwidth, smps mode or nss) modified values. Currently these valuse used by driver/firmware for rate control algorithm. This patch introduces NL80211_CMD_STA_OPMODE_CHANGED command to notify those modified/current supported values(max bandwidth, smps mode, max nss) to userspace application. This will be useful for the application like steering, which closely monitoring station's capability changes. Since the application has taken these values during station association. Signed-off-by: Tamizh chelvam Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 43 ++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 12 ++++++++++ net/wireless/nl80211.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 68def3e5b013..7d49cd0cf92d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3553,6 +3553,35 @@ enum wiphy_vendor_command_flags { WIPHY_VENDOR_CMD_NEED_RUNNING = BIT(2), }; +/** + * enum wiphy_opmode_flag - Station's ht/vht operation mode information flags + * + * @STA_OPMODE_MAX_BW_CHANGED: Max Bandwidth changed + * @STA_OPMODE_SMPS_MODE_CHANGED: SMPS mode changed + * @STA_OPMODE_N_SS_CHANGED: max N_SS (number of spatial streams) changed + * + */ +enum wiphy_opmode_flag { + STA_OPMODE_MAX_BW_CHANGED = BIT(0), + STA_OPMODE_SMPS_MODE_CHANGED = BIT(1), + STA_OPMODE_N_SS_CHANGED = BIT(2), +}; + +/** + * struct sta_opmode_info - Station's ht/vht operation mode information + * @changed: contains value from &enum wiphy_opmode_flag + * @smps_mode: New SMPS mode of a station + * @bw: new max bandwidth value of a station + * @rx_nss: new rx_nss value of a station + */ + +struct sta_opmode_info { + u32 changed; + u8 smps_mode; + u8 bw; + u8 rx_nss; +}; + /** * struct wiphy_vendor_command - vendor command definition * @info: vendor command identifying information, as used in nl80211 @@ -5721,6 +5750,20 @@ void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp); void cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, gfp_t gfp); +/** + * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event + * @dev: network device + * @mac: MAC address of a station which opmode got modified + * @sta_opmode: station's current opmode value + * @gfp: context flags + * + * Driver should call this function when station's opmode modified via action + * frame. + */ +void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, + struct sta_opmode_info *sta_opmode, + gfp_t gfp); + /** * cfg80211_cac_event - Channel availability check (CAC) event * @netdev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c2342456cf16..ca3d5a613fc0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1013,6 +1013,11 @@ * user space through the connect result as the user space would have * initiated the connection through the connect request. * + * @NL80211_CMD_STA_OPMODE_CHANGED: An event that notify station's + * ht opmode or vht opmode changes using any of &NL80211_ATTR_SMPS_MODE, + * &NL80211_ATTR_CHANNEL_WIDTH,&NL80211_ATTR_NSS attributes with its + * address(specified in &NL80211_ATTR_MAC). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1221,6 +1226,8 @@ enum nl80211_commands { NL80211_CMD_EXTERNAL_AUTH, + NL80211_CMD_STA_OPMODE_CHANGED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2186,6 +2193,9 @@ enum nl80211_commands { * authentication processing to user space if this capability is indicated * in NL80211_CMD_CONNECT requests from the user space. * + * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this + * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2615,6 +2625,8 @@ enum nl80211_attrs { NL80211_ATTR_EXTERNAL_AUTH_ACTION, NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, + NL80211_ATTR_NSS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bdb70fe74e3c..cc6ec5bab676 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14950,6 +14950,61 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, + struct sta_opmode_info *sta_opmode, + gfp_t gfp) +{ + struct sk_buff *msg; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + void *hdr; + + if (WARN_ON(!mac)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STA_OPMODE_CHANGED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_SMPS_MODE_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_SMPS_MODE, sta_opmode->smps_mode)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_NSS, sta_opmode->rx_nss)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify); + void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, gfp_t gfp) { -- cgit v1.2.3 From 01883eda72bd3f0a6c81447e4f223de14033fd9d Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:35 -0800 Subject: rds: support for zcopy completion notification RDS removes a datagram (rds_message) from the retransmit queue when an ACK is received. The ACK indicates that the receiver has queued the RDS datagram, so that the sender can safely forget the datagram. When all references to the rds_message are quiesced, rds_message_purge is called to release resources used by the rds_message If the datagram to be removed had pinned pages set up, add an entry to the rs->rs_znotify_queue so that the notifcation will be sent up via rds_rm_zerocopy_callback() when the rds_message is eventually freed by rds_message_purge. rds_rm_zerocopy_callback() attempts to batch the number of cookies sent with each notification to a max of SO_EE_ORIGIN_MAX_ZCOOKIES. This is achieved by checking the tail skb in the sk_error_queue: if this has room for one more cookie, the cookie from the current notification is added; else a new skb is added to the sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will trigger a ->sk_error_report to notify the application. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 2 ++ net/rds/af_rds.c | 2 ++ net/rds/message.c | 83 +++++++++++++++++++++++++++++++++++++++---- net/rds/rds.h | 14 ++++++++ net/rds/recv.c | 2 ++ 5 files changed, 96 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..28812eda4209 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,11 +20,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 0a8eefd256b3..a937f18896ae 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ diff --git a/net/rds/message.c b/net/rds/message.c index ef3daafa3d79..bf1a656b198a 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include "rds.h" @@ -53,29 +56,95 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + int ncookies; + u32 *ptr; + + if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + return false; + ncookies = serr->ee.ee_data; + if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) + return false; + ptr = skb_put(skb, sizeof(u32)); + *ptr = cookie; + serr->ee.ee_data = ++ncookies; + return true; +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct sk_buff *skb, *tail; + struct sock_exterr_skb *serr; + unsigned long flags; + struct sk_buff_head *q; + u32 cookie = znotif->z_cookie; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + + if (tail && skb_zcookie_add(tail, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + mm_unaccount_pinned_pages(&znotif->z_mmp); + consume_skb(rds_skb_from_znotifier(znotif)); + sk->sk_error_report(sk); + return; + } + + skb = rds_skb_from_znotifier(znotif); + serr = SKB_EXT_ERR(skb); + memset(&serr->ee, 0, sizeof(serr->ee)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; + serr->ee.ee_info = 0; + WARN_ON(!skb_zcookie_add(skb, cookie)); + + __skb_queue_tail(q, skb); + + spin_unlock_irqrestore(&q->lock, flags); + sk->sk_error_report(sk); + + mm_unaccount_pinned_pages(&znotif->z_mmp); +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); - /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); - } - rm->data.op_nents = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); if (rm->m_rs) { - sock_put(rds_rs_to_sk(rm->m_rs)); + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { + /* XXX will have to put_page for page refs */ + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) diff --git a/net/rds/rds.h b/net/rds/rds.h index 7301b9b01890..24576bc4a5e9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 +struct rds_znotifier { + struct list_head z_list; + struct mmpin z_mmp; + u32 z_cookie; +}; + +#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0])) + +static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z) +{ + return container_of((void *)z, struct sk_buff, cb); +} + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -436,6 +449,7 @@ struct rds_message { unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca..b080961464df 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ -- cgit v1.2.3 From 0cebaccef3acbdfbc2d85880a2efb765d2f4e2e3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:36 -0800 Subject: rds: zerocopy Tx support. If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and, if the SO_ZEROCOPY socket option has been set on the PF_RDS socket, application pages sent down with rds_sendmsg() are pinned. The pinning uses the accounting infrastructure added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") The payload bytes in the message may not be modified for the duration that the message has been pinned. A multi-threaded application using this infrastructure may thus need to be notified about send-completion so that it can free/reuse the buffers passed to rds_sendmsg(). Notification of send-completion will identify each message-buffer by a cookie that the application must specify as ancillary data to rds_sendmsg(). The ancillary data in this case has cmsg_level == SOL_RDS and cmsg_type == RDS_CMSG_ZCOPY_COOKIE. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 1 + net/rds/message.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- net/rds/rds.h | 3 ++- net/rds/send.c | 44 +++++++++++++++++++++++++++++++++++------ 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index e71d4491f225..12e3bca32cad 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -103,6 +103,7 @@ #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 +#define RDS_CMSG_ZCOPY_COOKIE 12 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 diff --git a/net/rds/message.c b/net/rds/message.c index bf1a656b198a..651834513481 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -341,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) { unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -356,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + if (zcopy) { + int total_copied = 0; + struct sk_buff *skb; + + skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), + GFP_KERNEL); + if (!skb) + return -ENOMEM; + rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + consume_skb(skb); + rm->data.op_mmp_znotifier = NULL; + return ret; + } /* zcopy */ + while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), diff --git a/net/rds/rds.h b/net/rds/rds.h index 24576bc4a5e9..31cd38852050 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -785,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); diff --git a/net/rds/send.c b/net/rds/send.c index e8f3ff471b15..028ab598ac1b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -875,12 +875,13 @@ out: * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int data_len) +static int rds_rm_size(struct msghdr *msg, int num_sgs) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; + bool zcopy_cookie = false; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -899,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len) break; + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; @@ -919,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) @@ -928,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -970,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -1040,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ - if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } @@ -1087,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } /* size of rm including all sgs */ - ret = rds_rm_size(msg, payload_len); + ret = rds_rm_size(msg, num_sgs); if (ret < 0) goto out; @@ -1100,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (!rm->data.op_sg) { ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &msg->msg_iter); + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } -- cgit v1.2.3 From 1ec010e705934c8acbe7dbf31afc81e60e3d828b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Feb 2018 11:03:07 +0100 Subject: tun: export flags, uid, gid, queue information over netlink Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/tun.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if_link.h | 18 ++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8e9a0ac644d2..86b16013e9c5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2291,11 +2291,67 @@ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], return -EINVAL; } +static size_t tun_get_size(const struct net_device *dev) +{ + BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); + BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); + + return nla_total_size(sizeof(uid_t)) + /* OWNER */ + nla_total_size(sizeof(gid_t)) + /* GROUP */ + nla_total_size(sizeof(u8)) + /* TYPE */ + nla_total_size(sizeof(u8)) + /* PI */ + nla_total_size(sizeof(u8)) + /* VNET_HDR */ + nla_total_size(sizeof(u8)) + /* PERSIST */ + nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ + nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ + nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ + 0; +} + +static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + + if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) + goto nla_put_failure; + if (uid_valid(tun->owner) && + nla_put_u32(skb, IFLA_TUN_OWNER, + from_kuid_munged(current_user_ns(), tun->owner))) + goto nla_put_failure; + if (gid_valid(tun->group) && + nla_put_u32(skb, IFLA_TUN_GROUP, + from_kgid_munged(current_user_ns(), tun->group))) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, + !!(tun->flags & IFF_MULTI_QUEUE))) + goto nla_put_failure; + if (tun->flags & IFF_MULTI_QUEUE) { + if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, + tun->numdisabled)) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, + .get_size = tun_get_size, + .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6d9447700e18..11d0c0ea2bfa 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -941,4 +941,22 @@ enum { IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ }; +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3 From c4b50cd31d25c3d17886ffc47ca4a9a12c6dc9bf Mon Sep 17 00:00:00 2001 From: Venkateswara Naralasetty Date: Tue, 13 Feb 2018 11:03:06 +0530 Subject: cfg80211: send ack_signal to user in probe client response This patch provides support to get ack signal in probe client response and in station info from user. Signed-off-by: Venkateswara Naralasetty [squash in compilation fixes] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 3 ++- include/net/cfg80211.h | 7 ++++++- include/uapi/linux/nl80211.h | 3 +++ net/mac80211/status.c | 2 +- net/wireless/nl80211.c | 8 ++++++-- 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 768f63f38341..b799a5384abb 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1599,7 +1599,8 @@ static void wil_probe_client_handle(struct wil6210_priv *wil, */ bool alive = (sta->status == wil_sta_connected); - cfg80211_probe_status(ndev, sta->addr, req->cookie, alive, GFP_KERNEL); + cfg80211_probe_status(ndev, sta->addr, req->cookie, alive, + 0, false, GFP_KERNEL); } static struct list_head *next_probe_client(struct wil6210_priv *wil) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d49cd0cf92d..56e905cd4b07 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1147,6 +1147,7 @@ struct cfg80211_tid_stats { * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * @ack_signal: signal strength (in dBm) of the last ACK frame. */ struct station_info { u64 filled; @@ -1191,6 +1192,7 @@ struct station_info { u64 rx_duration; u8 rx_beacon_signal_avg; struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; + s8 ack_signal; }; #if IS_ENABLED(CONFIG_CFG80211) @@ -5838,10 +5840,13 @@ bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, * @addr: the address of the peer * @cookie: the cookie filled in @probe_client previously * @acked: indicates whether probe was acked or not + * @ack_signal: signal strength (in dBm) of the ACK frame. + * @is_valid_ack_signal: indicates the ack_signal is valid or not. * @gfp: allocation flags */ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, - u64 cookie, bool acked, gfp_t gfp); + u64 cookie, bool acked, s32 ack_signal, + bool is_valid_ack_signal, gfp_t gfp); /** * cfg80211_report_obss_beacon - report beacon from other APs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ca3d5a613fc0..c13c84304be3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2626,6 +2626,7 @@ enum nl80211_attrs { NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, NL80211_ATTR_NSS, + NL80211_ATTR_ACK_SIGNAL, /* add attributes here, update the policy in nl80211.c */ @@ -2947,6 +2948,7 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_RX_DURATION: aggregate PPDU duration for all frames * received from the station (u64, usec) * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment + * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -2985,6 +2987,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_TID_STATS, NL80211_STA_INFO_RX_DURATION, NL80211_STA_INFO_PAD, + NL80211_STA_INFO_ACK_SIGNAL, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index da7427a41529..d74d44e65bd7 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -486,7 +486,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, + cookie, acked, 0, false, GFP_ATOMIC); else cfg80211_mgmt_tx_status(&sdata->wdev, cookie, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c6f256b29c73..050ff61b06a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4486,6 +4486,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); + PUT_SINFO(ACK_SIGNAL, ack_signal, u8); #undef PUT_SINFO #undef PUT_SINFO_U64 @@ -14984,7 +14985,8 @@ nla_put_failure: EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify); void cfg80211_probe_status(struct net_device *dev, const u8 *addr, - u64 cookie, bool acked, gfp_t gfp) + u64 cookie, bool acked, s32 ack_signal, + bool is_valid_ack_signal, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -15009,7 +15011,9 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, NL80211_ATTR_PAD) || - (acked && nla_put_flag(msg, NL80211_ATTR_ACK))) + (acked && nla_put_flag(msg, NL80211_ATTR_ACK)) || + (is_valid_ack_signal && nla_put_s32(msg, NL80211_ATTR_ACK_SIGNAL, + ack_signal))) goto nla_put_failure; genlmsg_end(msg, hdr); -- cgit v1.2.3 From ccc007e4a746bb592d3e72106f00241f81d51410 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 15 Feb 2018 19:42:43 +0200 Subject: net: sched: add em_ipt ematch for calling xtables matches The commit a new tc ematch for using netfilter xtable matches. This allows early classification as well as mirroning/redirecting traffic based on logic implemented in netfilter extensions. Current supported use case is classification based on the incoming IPSec state used during decpsulation using the 'policy' iptables extension (xt_policy). The module dynamically fetches the netfilter match module and calls it using a fake xt_action_param structure based on validated userspace provided parameters. As the xt_policy match does not access skb->data, no skb modifications are needed on match. Signed-off-by: Eyal Birger Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 20 +++ net/sched/Kconfig | 12 ++ net/sched/Makefile | 1 + net/sched/em_ipt.c | 257 +++++++++++++++++++++++++++++++ 5 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 46c506615f4a..7cafb26df555 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -555,7 +555,8 @@ enum { #define TCF_EM_VLAN 6 #define TCF_EM_CANID 7 #define TCF_EM_IPSET 8 -#define TCF_EM_MAX 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 enum { TCF_EM_PROG_TC diff --git a/include/uapi/linux/tc_ematch/tc_em_ipt.h b/include/uapi/linux/tc_ematch/tc_em_ipt.h new file mode 100644 index 000000000000..49a65530992c --- /dev/null +++ b/include/uapi/linux/tc_ematch/tc_em_ipt.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_TC_EM_IPT_H +#define __LINUX_TC_EM_IPT_H + +#include +#include + +enum { + TCA_EM_IPT_UNSPEC, + TCA_EM_IPT_HOOK, + TCA_EM_IPT_MATCH_NAME, + TCA_EM_IPT_MATCH_REVISION, + TCA_EM_IPT_NFPROTO, + TCA_EM_IPT_MATCH_DATA, + __TCA_EM_IPT_MAX +}; + +#define TCA_EM_IPT_MAX (__TCA_EM_IPT_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index f24a6ae6819a..a01169fb5325 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -658,6 +658,18 @@ config NET_EMATCH_IPSET To compile this code as a module, choose M here: the module will be called em_ipset. +config NET_EMATCH_IPT + tristate "IPtables Matches" + depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES + ---help--- + Say Y here to be able to classify packets based on iptables + matches. + Current supported match is "policy" which allows packet classification + based on IPsec policy that was used during decapsulation + + To compile this code as a module, choose M here: the + module will be called em_ipt. + config NET_CLS_ACT bool "Actions" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 5b635447e3f8..8811d3804878 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o +obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c new file mode 100644 index 000000000000..a5f34e930eff --- /dev/null +++ b/net/sched/em_ipt.c @@ -0,0 +1,257 @@ +/* + * net/sched/em_ipt.c IPtables matches Ematch + * + * (c) 2018 Eyal Birger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct em_ipt_match { + const struct xt_match *match; + u32 hook; + u8 match_data[0] __aligned(8); +}; + +struct em_ipt_xt_match { + char *match_name; + int (*validate_match_data)(struct nlattr **tb, u8 mrev); +}; + +static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = { + [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, + [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 }, + [TCA_EM_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_EM_IPT_NFPROTO] = { .type = NLA_U8 }, + [TCA_EM_IPT_MATCH_DATA] = { .type = NLA_UNSPEC }, +}; + +static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len) +{ + struct xt_mtchk_param mtpar = {}; + union { + struct ipt_entry e4; + struct ip6t_entry e6; + } e = {}; + + mtpar.net = net; + mtpar.table = "filter"; + mtpar.hook_mask = 1 << im->hook; + mtpar.family = im->match->family; + mtpar.match = im->match; + mtpar.entryinfo = &e; + mtpar.matchinfo = (void *)im->match_data; + return xt_check_match(&mtpar, mdata_len, 0, 0); +} + +static int policy_validate_match_data(struct nlattr **tb, u8 mrev) +{ + if (mrev != 0) { + pr_err("only policy match revision 0 supported"); + return -EINVAL; + } + + if (nla_get_u32(tb[TCA_EM_IPT_HOOK]) != NF_INET_PRE_ROUTING) { + pr_err("policy can only be matched on NF_INET_PRE_ROUTING"); + return -EINVAL; + } + + return 0; +} + +static const struct em_ipt_xt_match em_ipt_xt_matches[] = { + { + .match_name = "policy", + .validate_match_data = policy_validate_match_data + }, + {} +}; + +static struct xt_match *get_xt_match(struct nlattr **tb) +{ + const struct em_ipt_xt_match *m; + struct nlattr *mname_attr; + u8 nfproto, mrev = 0; + int ret; + + mname_attr = tb[TCA_EM_IPT_MATCH_NAME]; + for (m = em_ipt_xt_matches; m->match_name; m++) { + if (!nla_strcmp(mname_attr, m->match_name)) + break; + } + + if (!m->match_name) { + pr_err("Unsupported xt match"); + return ERR_PTR(-EINVAL); + } + + if (tb[TCA_EM_IPT_MATCH_REVISION]) + mrev = nla_get_u8(tb[TCA_EM_IPT_MATCH_REVISION]); + + ret = m->validate_match_data(tb, mrev); + if (ret < 0) + return ERR_PTR(ret); + + nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]); + return xt_request_find_match(nfproto, m->match_name, mrev); +} + +static int em_ipt_change(struct net *net, void *data, int data_len, + struct tcf_ematch *em) +{ + struct nlattr *tb[TCA_EM_IPT_MAX + 1]; + struct em_ipt_match *im = NULL; + struct xt_match *match; + int mdata_len, ret; + + ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, + NULL); + if (ret < 0) + return ret; + + if (!tb[TCA_EM_IPT_HOOK] || !tb[TCA_EM_IPT_MATCH_NAME] || + !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO]) + return -EINVAL; + + match = get_xt_match(tb); + if (IS_ERR(match)) { + pr_err("unable to load match\n"); + return PTR_ERR(match); + } + + mdata_len = XT_ALIGN(nla_len(tb[TCA_EM_IPT_MATCH_DATA])); + im = kzalloc(sizeof(*im) + mdata_len, GFP_KERNEL); + if (!im) { + ret = -ENOMEM; + goto err; + } + + im->match = match; + im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]); + nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len); + + ret = check_match(net, im, mdata_len); + if (ret) + goto err; + + em->datalen = sizeof(*im) + mdata_len; + em->data = (unsigned long)im; + return 0; + +err: + kfree(im); + module_put(match->me); + return ret; +} + +static void em_ipt_destroy(struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (!im) + return; + + if (im->match->destroy) { + struct xt_mtdtor_param par = { + .net = em->net, + .match = im->match, + .matchinfo = im->match_data, + .family = im->match->family + }; + im->match->destroy(&par); + } + module_put(im->match->me); + kfree((void *)im); +} + +static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + const struct em_ipt_match *im = (const void *)em->data; + struct xt_action_param acpar = {}; + struct net_device *indev = NULL; + struct nf_hook_state state; + int ret; + + rcu_read_lock(); + + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + + nf_hook_state_init(&state, im->hook, im->match->family, + indev ?: skb->dev, skb->dev, NULL, em->net, NULL); + + acpar.match = im->match; + acpar.matchinfo = im->match_data; + acpar.state = &state; + + ret = im->match->match(skb, &acpar); + + rcu_read_unlock(); + return ret; +} + +static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (nla_put_string(skb, TCA_EM_IPT_MATCH_NAME, im->match->name) < 0) + return -EMSGSIZE; + if (nla_put_u32(skb, TCA_EM_IPT_HOOK, im->hook) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0) + return -EMSGSIZE; + if (nla_put(skb, TCA_EM_IPT_MATCH_DATA, + im->match->usersize ?: im->match->matchsize, + im->match_data) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_ipt_ops = { + .kind = TCF_EM_IPT, + .change = em_ipt_change, + .destroy = em_ipt_destroy, + .match = em_ipt_match, + .dump = em_ipt_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipt_ops.link) +}; + +static int __init init_em_ipt(void) +{ + return tcf_em_register(&em_ipt_ops); +} + +static void __exit exit_em_ipt(void) +{ + tcf_em_unregister(&em_ipt_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eyal Birger "); +MODULE_DESCRIPTION("TC extended match for IPtables matches"); + +module_init(init_em_ipt); +module_exit(exit_em_ipt); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPT); -- cgit v1.2.3 From cac56209a66ea3b0be67aa2966b2c628b944da1e Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 20 Feb 2018 08:55:58 -0500 Subject: net: Allow a rule to track originating protocol Allow a rule that is being added/deleted/modified or dumped to contain the originating protocol's id. The protocol is handled just like a routes originating protocol is. This is especially useful because there is starting to be a plethora of different user space programs adding rules. Allow the vrf device to specify that the kernel is the originator of the rule created for this device. Signed-off-by: Donald Sharp Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 + include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 7 ++++++- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 239c78c53e58..951a4b42cb29 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1174,6 +1174,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; + frh->proto = RTPROT_KERNEL; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 648caf90ec07..b166ef07e6d4 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -26,7 +26,8 @@ struct fib_rule { u32 table; u8 action; u8 l3mdev; - /* 2 bytes hole, try to use */ + u8 proto; + /* 1 byte hole, try to use */ u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b642bf9b5a0..925539172d5b 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; + __u8 proto; __u8 res1; /* reserved */ - __u8 res2; /* reserved */ __u8 action; __u32 flags; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index cb071b8e8d17..88298f18cbae 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -51,6 +51,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; + r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -465,6 +466,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; + rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); @@ -664,6 +666,9 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { + if (frh->proto && (frh->proto != rule->proto)) + continue; + if (frh->action && (frh->action != rule->action)) continue; @@ -808,9 +813,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; - frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; + frh->proto = rule->proto; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) -- cgit v1.2.3 From 4fe0de5b143762d327bfaf1d7be7c5b58041a18c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Wed, 31 Jan 2018 19:04:15 -0600 Subject: uapi: Add 802.11 Preauthentication to if_ether This adds 0x88c7 protocol type to if_ether. Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f8cb5760ea4f..54585906e50a 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -89,6 +89,7 @@ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ +#define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ -- cgit v1.2.3 From 1b71af6053af1bd2f849e9fda4f71c1e3f145dcf Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Fri, 23 Feb 2018 14:01:52 -0500 Subject: net: fib_rules: Add new attribute to set protocol For ages iproute2 has used `struct rtmsg` as the ancillary header for FIB rules and in the process set the protocol value to RTPROT_BOOT. Until ca56209a66 ("net: Allow a rule to track originating protocol") the kernel rules code ignored the protocol value sent from userspace and always returned 0 in notifications. To avoid incompatibility with existing iproute2, send the protocol as a new attribute. Fixes: cac56209a66 ("net: Allow a rule to track originating protocol") Signed-off-by: Donald Sharp Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++++- include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 5 +++-- net/core/fib_rules.c | 15 +++++++++++---- 4 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 951a4b42cb29..9ce0182223a0 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1145,6 +1145,7 @@ static inline size_t vrf_fib_rule_nl_size(void) sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ + sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } @@ -1174,7 +1175,9 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; - frh->proto = RTPROT_KERNEL; + + if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) + goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index b166ef07e6d4..b3d216249240 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -109,7 +109,8 @@ struct fib_rule_notifier_info { [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_L3MDEV] = { .type = NLA_U8 }, \ - [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ + [FRA_PROTOCOL] = { .type = NLA_U8 } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 925539172d5b..77d90ae38114 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; - __u8 proto; - __u8 res1; /* reserved */ + __u8 res1; /* reserved */ + __u8 res2; /* reserved */ __u8 action; __u32 flags; @@ -58,6 +58,7 @@ enum { FRA_PAD, FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ + FRA_PROTOCOL, /* Originator of the rule */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 88298f18cbae..a6aea805a0a2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -466,11 +466,13 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; - rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); + rule->proto = tb[FRA_PROTOCOL] ? + nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -666,7 +668,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { - if (frh->proto && (frh->proto != rule->proto)) + if (tb[FRA_PROTOCOL] && + (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) continue; if (frh->action && (frh->action != rule->action)) @@ -786,7 +789,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ - + nla_total_size(sizeof(struct fib_kuid_range)); + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1); /* FRA_PROTOCOL */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -813,9 +817,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; + frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; - frh->proto = rule->proto; + + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) -- cgit v1.2.3 From 6b1aea8cf2c8618146edaf6b35775ab55f7cafe5 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Jan 2018 00:00:00 +0100 Subject: batman-adv: Update copyright years for 2018 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 61 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 5cb360be2a11..daefd7283896 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index ae00c99cbed0..56ae28934070 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c44f6515be5e..e4e2e02b7380 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 022f6e77307b..b97ba6fb8353 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 80c72c7d3cad..ea309ad06175 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 029221615ba3..534b790c3753 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 79e326383726..e21aa147607b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 9dc0dd5c83df..317cafd302cf 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 27e165ac9302..9c3a34b65b15 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index a17ab68bbce8..ec4a2a569750 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index a83478c46597..28687493599f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 5e39d0588a48..e8c7b7fd290d 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index ba59b77c605d..2948b41b06d4 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 6a4c14ccc3c6..ed36c5e79fde 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index bdc1ef06e05b..a296a4d851f5 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index ca9d0753dd6b..48f683289531 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index fad47853ad3c..8ff81346ff0c 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index b27571abcd2f..71f95a3e4d3f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 21d1189957a7..4229b01ac7b5 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 90a08d35c501..37b069698b04 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 9703c791ffc5..19b15de455ab 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 12897eb46268..e24aa9601c52 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 22dde42fd80e..d815acc13c35 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 138b22a1836a..944512e07782 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 37fe9a644f22..c294f6fd43e0 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 981f58421a32..f0b86fcb2493 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b3e156af2256..936c107f3199 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index afebd9c7edf4..80afb2793687 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5f186bff284a..fd4a263dd6b7 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index de5e9a374ece..d1c0f6189301 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 04d964358c98..7b49e4001778 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 4ce1b6d3ad5c..9490a7ca2ba6 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index e91f29c7c638..7d5e9abb7a65 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 84cddd01eeab..958be22beda9 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index dc9fa37ddd14..52d8a4b848c0 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 35e02b2b9e72..35f4f397ed57 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d31c8266e244..69c0d85bceb3 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 69bfedfad174..d5d65999207e 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index cbdeb47ec3f6..6eaffe50335a 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3ac06337ab71..6b8594e23da3 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a823d3899bad..129af56b944d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 0e7e57b69b54..571d9a5ae7aa 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b48116bb24ef..c3578444f3cb 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index adaeafa4f71e..65c346812bc1 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 58a7d9274435..2a51a0cbb82a 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 8e543a3cdc6c..f3601ab0872e 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b6891e8b741c..289df027ecdd 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index a1289bc5f115..db54c2d9b8bf 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 2a5ab6f1076d..4a35f5c2f52b 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 1e8c79093623..64cce07b8fe6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 900c5ce21cd4..c95e2b2677fd 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 075c5b5b2ce1..daf87f07fadd 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index c1578fa0b952..f2eef43bd2ec 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index bbeee61221fa..c1e3fb69952d 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 8b576712d0c1..11520de96ccb 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index c8b8f2cb2c2b..68e600974759 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7550a9ccd695..0225616d5771 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 8d9e3abec2c8..01b6c8eafaf9 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 5ffcb45ac6ff..a637458205d1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index a74df33f446d..ef5867f49824 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index bb1578410e0c..4a3b8837e1b5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * -- cgit v1.2.3 From a163dc22d515d17844435c8217ff66193d35b3fa Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 24 Jan 2018 12:21:37 +0100 Subject: batman-adv: always assume 2-byte packet alignment NIC drivers generally try to ensure that the "network header" is aligned to a 4-byte boundary. This is not always possible: When Ethernet frames are encapsulated in other packets with 4-byte aligned headers, the inner Ethernet header will have 4-byte alignment, and in consequence, the inner network header is aligned to 2, but not to 4 bytes. Most parts of batman-adv only care about 2-byte alignment; in particular, no unaligned accesses occur in performance-critical paths that handle actual payload data. This is not true for OGM handling: the seqno and crc fields are accessed as 32-bit values. To avoid these unaligned accesses, this patch reduces the expected packet alignment to 2 bytes for all of batadv's packet types. As no unaligned accesses existed on the performance-critical paths anyways, this chance does have any (positive or negative) effect on performance, but it still makes sense to avoid these accesses to prevent log noise when examining other unaligned accesses in the kernel while batman-adv is active. Signed-off-by: Matthias Schiffer Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index daefd7283896..894d8d2f713d 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -196,8 +196,6 @@ struct batadv_bla_claim_dst { __be16 group; /* group id */ }; -#pragma pack() - /** * struct batadv_ogm_packet - ogm (routing protocol) packet * @packet_type: batman-adv packet type, part of the general header @@ -222,9 +220,6 @@ struct batadv_ogm_packet { __u8 reserved; __u8 tq; __be16 tvlv_len; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ }; #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) @@ -249,9 +244,6 @@ struct batadv_ogm2_packet { __u8 orig[ETH_ALEN]; __be16 tvlv_len; __be32 throughput; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ }; #define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) @@ -405,7 +397,6 @@ struct batadv_icmp_packet_rr { * misalignment of the payload after the ethernet header. It may also lead to * leakage of information when the padding it not initialized before sending. */ -#pragma pack(2) /** * struct batadv_unicast_packet - unicast packet for network payload @@ -533,8 +524,6 @@ struct batadv_coded_packet { __be16 coded_len; }; -#pragma pack() - /** * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header @@ -641,4 +630,6 @@ struct batadv_tvlv_mcast_data { __u8 reserved[3]; }; +#pragma pack() + #endif /* _UAPI_LINUX_BATADV_PACKET_H_ */ -- cgit v1.2.3 From 401910db4cd425899832a093539222b6174f92a2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 27 Feb 2018 09:52:43 -0800 Subject: rds: deliver zerocopy completion notification with data This commit is an optimization over commit 01883eda72bd ("rds: support for zcopy completion notification") for PF_RDS sockets. RDS applications are predominantly request-response transactions, so it is more efficient to reduce the number of system calls and have zerocopy completion notification delivered as ancillary data on the POLLIN channel. Cookies are passed up as ancillary data (at level SOL_RDS) in a struct rds_zcopy_cookies when the returned value of recvmsg() is greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed with each message. This commit removes support for zerocopy completion notification on MSG_ERRQUEUE for PF_RDS sockets. Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 2 -- include/uapi/linux/rds.h | 7 +++++++ net/rds/af_rds.c | 7 +++++-- net/rds/message.c | 38 ++++++++++++++++---------------------- net/rds/rds.h | 2 ++ net/rds/recv.c | 31 ++++++++++++++++++++++++++++++- 6 files changed, 60 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 28812eda4209..dc64cfaf13da 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,13 +20,11 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 -#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 -#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 12e3bca32cad..a66b213de3d7 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -104,6 +104,7 @@ #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_CMSG_ZCOPY_COOKIE 12 +#define RDS_CMSG_ZCOPY_COMPLETION 13 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -317,6 +318,12 @@ struct rds_rdma_notify { #define RDS_RDMA_DROPPED 3 #define RDS_RDMA_OTHER_ERROR 4 +#define RDS_MAX_ZCOOKIES 8 +struct rds_zcopy_cookies { + __u32 num; + __u32 cookies[RDS_MAX_ZCOOKIES]; +}; + /* * Common set of flags for all RDMA related structs */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a937f18896ae..f7126108a811 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,6 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); + __skb_queue_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion - * update, or a RDMA completion). + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries @@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + !list_empty(&rs->rs_notify_queue) || + !skb_queue_empty(&rs->rs_zcookie_queue)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); @@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); + skb_queue_head_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; diff --git a/net/rds/message.c b/net/rds/message.c index 651834513481..116cf87ccb89 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -58,32 +58,26 @@ EXPORT_SYMBOL_GPL(rds_message_addref); static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); - int ncookies; - u32 *ptr; + struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb; + int ncookies = ck->num; - if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + if (ncookies == RDS_MAX_ZCOOKIES) return false; - ncookies = serr->ee.ee_data; - if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) - return false; - ptr = skb_put(skb, sizeof(u32)); - *ptr = cookie; - serr->ee.ee_data = ++ncookies; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; return true; } static void rds_rm_zerocopy_callback(struct rds_sock *rs, struct rds_znotifier *znotif) { - struct sock *sk = rds_rs_to_sk(rs); struct sk_buff *skb, *tail; - struct sock_exterr_skb *serr; unsigned long flags; struct sk_buff_head *q; u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; - q = &sk->sk_error_queue; + q = &rs->rs_zcookie_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_unlock_irqrestore(&q->lock, flags); mm_unaccount_pinned_pages(&znotif->z_mmp); consume_skb(rds_skb_from_znotifier(znotif)); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ return; } skb = rds_skb_from_znotifier(znotif); - serr = SKB_EXT_ERR(skb); - memset(&serr->ee, 0, sizeof(serr->ee)); - serr->ee.ee_errno = 0; - serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; - serr->ee.ee_info = 0; + ck = (struct rds_zcopy_cookies *)skb->cb; + memset(ck, 0, sizeof(*ck)); WARN_ON(!skb_zcookie_add(skb, cookie)); __skb_queue_tail(q, skb); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ mm_unaccount_pinned_pages(&znotif->z_mmp); } @@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm) if (rm->data.op_mmp_znotifier) { zcopy = true; rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); @@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, int total_copied = 0; struct sk_buff *skb; - skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), - GFP_KERNEL); + skb = alloc_skb(0, GFP_KERNEL); if (!skb) return -ENOMEM; + BUILD_BUG_ON(sizeof(skb->cb) < + max_t(int, sizeof(struct rds_znotifier), + sizeof(struct rds_zcopy_cookies))); rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, length)) { diff --git a/net/rds/rds.h b/net/rds/rds.h index 31cd38852050..33b16353d8f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -603,6 +603,8 @@ struct rds_sock { /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + + struct sk_buff_head rs_zcookie_queue; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index b080961464df..d50747725221 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -577,6 +577,32 @@ out: return ret; } +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct sk_buff *skb; + struct sk_buff_head *q = &rs->rs_zcookie_queue; + struct rds_zcopy_cookies *done; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + skb = skb_dequeue(q); + if (!skb) + return false; + done = (struct rds_zcopy_cookies *)skb->cb; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + skb_queue_head(q, skb); + return false; + } + consume_skb(skb); + return true; +} + int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { @@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (!rds_next_incoming(rs, &inc)) { if (nonblock) { - ret = -EAGAIN; + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 0 : -EAGAIN; break; } @@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ret = -EFAULT; goto out; } + rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); -- cgit v1.2.3 From bfff4862653bb96001ab57c1edd6d03f48e5f035 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:40:16 -0500 Subject: net: fib_rules: support for match on ip_proto, sport and dport uapi for ip_proto, sport and dport range match in fib rules. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_rules.h | 36 ++++++++++++++++- include/uapi/linux/fib_rules.h | 8 ++++ net/core/fib_rules.c | 92 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index b3d216249240..6dd0a00653ae 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -27,7 +27,7 @@ struct fib_rule { u8 action; u8 l3mdev; u8 proto; - /* 1 byte hole, try to use */ + u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; @@ -40,6 +40,8 @@ struct fib_rule { char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; + struct fib_rule_port_range sport_range; + struct fib_rule_port_range dport_range; struct rcu_head rcu; }; @@ -144,6 +146,38 @@ static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) return frh->table; } +static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) +{ + return range->start != 0 && range->end != 0; +} + +static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, + __be16 port) +{ + return ntohs(port) >= a->start && + ntohs(port) <= a->end; +} + +static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) +{ + return a->start != 0 && a->end != 0 && a->end < 0xffff && + a->start <= a->end; +} + +static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, + struct fib_rule_port_range *b) +{ + return a->start == b->start && + a->end == b->end; +} + +static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) +{ + return rule->ip_proto || + fib_rule_port_range_set(&rule->sport_range) || + fib_rule_port_range_set(&rule->dport_range); +} + struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 77d90ae38114..232df14e1287 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -35,6 +35,11 @@ struct fib_rule_uid_range { __u32 end; }; +struct fib_rule_port_range { + __u16 start; + __u16 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -59,6 +64,9 @@ enum { FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ FRA_PROTOCOL, /* Originator of the rule */ + FRA_IP_PROTO, /* ip proto */ + FRA_SPORT_RANGE, /* sport */ + FRA_DPORT_RANGE, /* dport */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a6aea805a0a2..f6f04fc0f629 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); @@ -221,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -425,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; + if (r->ip_proto != rule->ip_proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -569,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, rule->uid_range = fib_kuid_range_unset; } + if (tb[FRA_IP_PROTO]) + rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &rule->sport_range); + if (err) + goto errout_free; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &rule->dport_range); + if (err) + goto errout_free; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -634,6 +686,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule_port_range sprange = {0, 0}; + struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r; struct nlattr *tb[FRA_MAX+1]; @@ -667,6 +721,20 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, range = fib_kuid_range_unset; } + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &sprange); + if (err) + goto errout; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &dprange); + if (err) + goto errout; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (tb[FRA_PROTOCOL] && (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) @@ -712,6 +780,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, !uid_eq(rule->uid_range.end, range.end))) continue; + if (tb[FRA_IP_PROTO] && + (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) + continue; + + if (fib_rule_port_range_set(&sprange) && + !fib_rule_port_range_compare(&rule->sport_range, &sprange)) + continue; + + if (fib_rule_port_range_set(&dprange) && + !fib_rule_port_range_compare(&rule->dport_range, &dprange)) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -790,7 +870,10 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) - + nla_total_size(1); /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -855,7 +938,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && - nla_put_uid_range(skb, &rule->uid_range))) + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { -- cgit v1.2.3 From 77a5196a804e34ce5e215ef84d5e1de332e9c529 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 1 Mar 2018 13:49:57 -0800 Subject: gre: add sequence number for collect md mode. Currently GRE sequence number can only be used in native tunnel mode. This patch adds sequence number support for gre collect metadata mode. RFC2890 defines GRE sequence number to be specific to the traffic flow identified by the key. However, this patch does not implement per-key seqno. The sequence number is shared in the same tunnel device. That is, different tunnel keys using the same collect_md tunnel share single sequence number. Signed-off-by: William Tu Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 4 +++- net/ipv4/ip_gre.c | 7 +++++-- net/ipv6/ip6_gre.c | 13 ++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index db6bdc375126..2a66769e5875 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -800,6 +800,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) +#define BPF_F_SEQ_NUMBER (1ULL << 3) /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. diff --git a/net/core/filter.c b/net/core/filter.c index 0c121adbdbaa..33edfa8372fd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2991,7 +2991,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -3025,6 +3025,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; if (flags & BPF_F_ZERO_CSUM_TX) info->key.tun_flags &= ~TUNNEL_CSUM; + if (flags & BPF_F_SEQ_NUMBER) + info->key.tun_flags |= TUNNEL_SEQ; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0fe1d69b5df4..95fd225f402e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -522,6 +522,7 @@ err_free_skb: static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { + struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; @@ -545,9 +546,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = tun_info->key.tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 83c7766c8c75..18a3dfbd0300 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -695,9 +695,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -720,14 +717,20 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; - flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = key->tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tunnel->tun_hlen = gre_calc_hlen(flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + : 0); } else { + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); -- cgit v1.2.3 From 87ecc95d81d951b0984f2eb9c5c118cb68d0dce8 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:35 -0800 Subject: tcp: add send queue size stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_SENDQ_SIZE stat into SCM_TIMESTAMPING_OPT_STATS. It reports no. of bytes present in send queue, when timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b4a4f64635fa..93bad2128ef6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -241,6 +241,7 @@ enum { TCP_NLA_MIN_RTT, /* minimum RTT */ TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ + TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a33539798bf6..162ba4227446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 3 * nla_total_size(sizeof(u32)) + + 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3061,6 +3061,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); return stats; } -- cgit v1.2.3 From be631892948060f44b1ceee3132be1266932071e Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:36 -0800 Subject: tcp: add ca_state stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_CA_STATE stat into SCM_TIMESTAMPING_OPT_STATS. It reports ca_state of socket, when timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 93bad2128ef6..4c0ae0faf7ca 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -242,6 +242,7 @@ enum { TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ + TCP_NLA_CA_STATE, /* ca_state of socket */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 162ba4227446..fb350f740f69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3032,7 +3032,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3063,6 +3063,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); return stats; } -- cgit v1.2.3 From 955dc68cb9b23b42999cafe6df3684309bc686c6 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Mon, 5 Mar 2018 11:39:05 +1100 Subject: net/ncsi: Add generic netlink family Add a generic netlink family for NCSI. This supports three commands; NCSI_CMD_PKG_INFO which returns information on packages and their associated channels, NCSI_CMD_SET_INTERFACE which allows a specific package or package/channel combination to be set as the preferred choice, and NCSI_CMD_CLEAR_INTERFACE which clears any preferred setting. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/uapi/linux/ncsi.h | 115 +++++++++++++ net/ncsi/Makefile | 2 +- net/ncsi/internal.h | 3 + net/ncsi/ncsi-manage.c | 30 +++- net/ncsi/ncsi-netlink.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-netlink.h | 20 +++ 6 files changed, 586 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/ncsi.h create mode 100644 net/ncsi/ncsi-netlink.c create mode 100644 net/ncsi/ncsi-netlink.h (limited to 'include/uapi') diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h new file mode 100644 index 000000000000..4c292ecbb748 --- /dev/null +++ b/include/uapi/linux/ncsi.h @@ -0,0 +1,115 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __UAPI_NCSI_NETLINK_H__ +#define __UAPI_NCSI_NETLINK_H__ + +/** + * enum ncsi_nl_commands - supported NCSI commands + * + * @NCSI_CMD_UNSPEC: unspecified command to catch errors + * @NCSI_CMD_PKG_INFO: list package and channel attributes. Requires + * NCSI_ATTR_IFINDEX. If NCSI_ATTR_PACKAGE_ID is specified returns the + * specific package and its channels - otherwise a dump request returns + * all packages and their associated channels. + * @NCSI_CMD_SET_INTERFACE: set preferred package and channel combination. + * Requires NCSI_ATTR_IFINDEX and the preferred NCSI_ATTR_PACKAGE_ID and + * optionally the preferred NCSI_ATTR_CHANNEL_ID. + * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination. + * Requires NCSI_ATTR_IFINDEX. + * @NCSI_CMD_MAX: highest command number + */ +enum ncsi_nl_commands { + NCSI_CMD_UNSPEC, + NCSI_CMD_PKG_INFO, + NCSI_CMD_SET_INTERFACE, + NCSI_CMD_CLEAR_INTERFACE, + + __NCSI_CMD_AFTER_LAST, + NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_attrs - General NCSI netlink attributes + * + * @NCSI_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_ATTR_IFINDEX: ifindex of network device using NCSI + * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes + * @NCSI_ATTR_PACKAGE_ID: package ID + * @NCSI_ATTR_CHANNEL_ID: channel ID + * @NCSI_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_attrs { + NCSI_ATTR_UNSPEC, + NCSI_ATTR_IFINDEX, + NCSI_ATTR_PACKAGE_LIST, + NCSI_ATTR_PACKAGE_ID, + NCSI_ATTR_CHANNEL_ID, + + __NCSI_ATTR_AFTER_LAST, + NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_pkg_attrs - NCSI netlink package-specific attributes + * + * @NCSI_PKG_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_PKG_ATTR: nested array of package attributes + * @NCSI_PKG_ATTR_ID: package ID + * @NCSI_PKG_ATTR_FORCED: flag signifying a package has been set as preferred + * @NCSI_PKG_ATTR_CHANNEL_LIST: nested array of NCSI_CHANNEL_ATTR attributes + * @NCSI_PKG_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_pkg_attrs { + NCSI_PKG_ATTR_UNSPEC, + NCSI_PKG_ATTR, + NCSI_PKG_ATTR_ID, + NCSI_PKG_ATTR_FORCED, + NCSI_PKG_ATTR_CHANNEL_LIST, + + __NCSI_PKG_ATTR_AFTER_LAST, + NCSI_PKG_ATTR_MAX = __NCSI_PKG_ATTR_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_channel_attrs - NCSI netlink channel-specific attributes + * + * @NCSI_CHANNEL_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_CHANNEL_ATTR: nested array of channel attributes + * @NCSI_CHANNEL_ATTR_ID: channel ID + * @NCSI_CHANNEL_ATTR_VERSION_MAJOR: channel major version number + * @NCSI_CHANNEL_ATTR_VERSION_MINOR: channel minor version number + * @NCSI_CHANNEL_ATTR_VERSION_STR: channel version string + * @NCSI_CHANNEL_ATTR_LINK_STATE: channel link state flags + * @NCSI_CHANNEL_ATTR_ACTIVE: channels with this flag are in + * NCSI_CHANNEL_ACTIVE state + * @NCSI_CHANNEL_ATTR_FORCED: flag signifying a channel has been set as + * preferred + * @NCSI_CHANNEL_ATTR_VLAN_LIST: nested array of NCSI_CHANNEL_ATTR_VLAN_IDs + * @NCSI_CHANNEL_ATTR_VLAN_ID: VLAN ID being filtered on this channel + * @NCSI_CHANNEL_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_channel_attrs { + NCSI_CHANNEL_ATTR_UNSPEC, + NCSI_CHANNEL_ATTR, + NCSI_CHANNEL_ATTR_ID, + NCSI_CHANNEL_ATTR_VERSION_MAJOR, + NCSI_CHANNEL_ATTR_VERSION_MINOR, + NCSI_CHANNEL_ATTR_VERSION_STR, + NCSI_CHANNEL_ATTR_LINK_STATE, + NCSI_CHANNEL_ATTR_ACTIVE, + NCSI_CHANNEL_ATTR_FORCED, + NCSI_CHANNEL_ATTR_VLAN_LIST, + NCSI_CHANNEL_ATTR_VLAN_ID, + + __NCSI_CHANNEL_ATTR_AFTER_LAST, + NCSI_CHANNEL_ATTR_MAX = __NCSI_CHANNEL_ATTR_AFTER_LAST - 1 +}; + +#endif /* __UAPI_NCSI_NETLINK_H__ */ diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index dd12b564f2e7..436ef68331f2 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o ncsi-netlink.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index d30f7bd741d0..8da84312cd3b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -276,6 +276,8 @@ struct ncsi_dev_priv { unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ + struct ncsi_package *force_package; /* Force a specific package */ + struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -318,6 +320,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index); int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index c989211bbabc..c3695ba0cf94 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include "internal.h" #include "ncsi-pkt.h" +#include "ncsi-netlink.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); @@ -38,7 +38,7 @@ static inline int ncsi_filter_size(int table) return sizes[table]; } -static u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) { struct ncsi_channel_filter *ncf; int size; @@ -965,20 +965,37 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np; - struct ncsi_channel *nc, *found, *hot_nc; + struct ncsi_package *np, *force_package; + struct ncsi_channel *nc, *found, *hot_nc, *force_channel; struct ncsi_channel_mode *ncm; unsigned long flags; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; + force_channel = ndp->force_channel; + force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); + /* Force a specific channel whether or not it has link if we have been + * configured to do so + */ + if (force_package && force_channel) { + found = force_channel; + ncm = &found->modes[NCSI_MODE_LINK]; + if (!(ncm->data[2] & 0x1)) + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u forced, but it is link down\n", + found->id); + goto out; + } + /* The search is done once an inactive channel with up * link is found. */ found = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (ndp->force_package && np != ndp->force_package) + continue; NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); @@ -1594,6 +1611,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); + /* Set up generic netlink interface */ + ncsi_init_netlink(dev); + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); @@ -1673,6 +1693,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) #endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); + ncsi_unregister_netlink(nd->dev); + kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c new file mode 100644 index 000000000000..d4201665a580 --- /dev/null +++ b/net/ncsi/ncsi-netlink.c @@ -0,0 +1,421 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "ncsi-netlink.h" + +static struct genl_family ncsi_genl_family; + +static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { + [NCSI_ATTR_IFINDEX] = { .type = NLA_U32 }, + [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, + [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, +}; + +static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) +{ + struct ncsi_dev_priv *ndp; + struct net_device *dev; + struct ncsi_dev *nd; + struct ncsi_dev; + + if (!net) + return NULL; + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + pr_err("NCSI netlink: No device for ifindex %u\n", ifindex); + return NULL; + } + + nd = ncsi_find_dev(dev); + ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + + dev_put(dev); + return ndp; +} + +static int ncsi_write_channel_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct nlattr *vid_nest; + struct ncsi_channel_filter *ncf; + struct ncsi_channel_mode *m; + u32 *data; + int i; + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); + m = &nc->modes[NCSI_MODE_LINK]; + nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); + if (nc->state == NCSI_CHANNEL_ACTIVE) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); + if (ndp->force_channel == nc) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2); + nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); + + vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); + if (!vid_nest) + return -ENOMEM; + ncf = nc->filters[NCSI_FILTER_VLAN]; + i = -1; + if (ncf) { + while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total, + i + 1)) < ncf->total) { + data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i); + /* Uninitialised channels will have 'zero' vlan ids */ + if (!data || !*data) + continue; + nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, + *(u16 *)data); + } + } + nla_nest_end(skb, vid_nest); + + return 0; +} + +static int ncsi_write_package_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, unsigned int id) +{ + struct nlattr *pnest, *cnest, *nest; + struct ncsi_package *np; + struct ncsi_channel *nc; + bool found; + int rc; + + if (id > ndp->package_num) { + netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); + return -ENODEV; + } + + found = false; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id != id) + continue; + pnest = nla_nest_start(skb, NCSI_PKG_ATTR); + if (!pnest) + return -ENOMEM; + nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); + if (ndp->force_package == np) + nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); + cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); + if (!cnest) { + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + NCSI_FOR_EACH_CHANNEL(np, nc) { + nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR); + if (!nest) { + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + rc = ncsi_write_channel_info(skb, ndp, nc); + if (rc) { + nla_nest_cancel(skb, nest); + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return rc; + } + nla_nest_end(skb, nest); + } + nla_nest_end(skb, cnest); + nla_nest_end(skb, pnest); + found = true; + } + + if (!found) + return -ENODEV; + + return 0; +} + +static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned int package_id; + struct sk_buff *skb; + struct nlattr *attr; + void *hdr; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(genl_info_net(info), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + kfree(skb); + return -EMSGSIZE; + } + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + rc = ncsi_write_package_info(skb, ndp, package_id); + + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, info); + +err: + genlmsg_cancel(skb, hdr); + kfree(skb); + return rc; +} + +static int ncsi_pkg_info_all_nl(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *attrs[NCSI_ATTR_MAX]; + struct ncsi_package *np, *package; + struct ncsi_dev_priv *ndp; + unsigned int package_id; + struct nlattr *attr; + void *hdr; + int rc; + + rc = genlmsg_parse(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, + ncsi_genl_policy, NULL); + if (rc) + return rc; + + if (!attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)), + nla_get_u32(attrs[NCSI_ATTR_IFINDEX])); + + if (!ndp) + return -ENODEV; + + package_id = cb->args[0]; + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + + if (!package) + return 0; /* done */ + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + rc = -EMSGSIZE; + goto err; + } + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + rc = ncsi_write_package_info(skb, ndp, package->id); + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + cb->args[0] = package_id + 1; + + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return rc; +} + +static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + + spin_lock_irqsave(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + if (!package) { + /* The user has set a package that does not exist */ + return -ERANGE; + } + + channel = NULL; + if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { + /* Allow any channel */ + channel_id = NCSI_RESERVED_CHANNEL; + } else { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(package, nc) + if (nc->id == channel_id) + channel = nc; + } + + if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { + /* The user has set a channel that does not exist on this + * package + */ + netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } + + ndp->force_package = package; + ndp->force_channel = channel; + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", + package_id, channel_id, + channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : ""); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + /* Clear any override */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->force_package = NULL; + ndp->force_channel = NULL; + spin_unlock_irqrestore(&ndp->lock, flags); + netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static const struct genl_ops ncsi_ops[] = { + { + .cmd = NCSI_CMD_PKG_INFO, + .policy = ncsi_genl_policy, + .doit = ncsi_pkg_info_nl, + .dumpit = ncsi_pkg_info_all_nl, + .flags = 0, + }, + { + .cmd = NCSI_CMD_SET_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_set_interface_nl, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_CLEAR_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_clear_interface_nl, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family ncsi_genl_family __ro_after_init = { + .name = "NCSI", + .version = 0, + .maxattr = NCSI_ATTR_MAX, + .module = THIS_MODULE, + .ops = ncsi_ops, + .n_ops = ARRAY_SIZE(ncsi_ops), +}; + +int ncsi_init_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_register_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to register netlink family\n"); + + return rc; +} + +int ncsi_unregister_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_unregister_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to unregister netlink family\n"); + + return rc; +} diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h new file mode 100644 index 000000000000..91a5c256f8c4 --- /dev/null +++ b/net/ncsi/ncsi-netlink.h @@ -0,0 +1,20 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_NETLINK_H__ +#define __NCSI_NETLINK_H__ + +#include + +#include "internal.h" + +int ncsi_init_netlink(struct net_device *dev); +int ncsi_unregister_netlink(struct net_device *dev); + +#endif /* __NCSI_NETLINK_H__ */ -- cgit v1.2.3 From ed63afb8a318f6b3558d76afba7809daee4f28e5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:18 +0800 Subject: sctp: add support for PR-SCTP Information for sendmsg This patch is to add support for PR-SCTP Information for sendmsg, as described in section 5.3.7 of RFC6458. With this option, you can specify pr_policy and pr_value for user data in sendmsg. It's also a necessary send info for sctp_sendv. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 15 +++++++++++++++ net/sctp/socket.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 03e92dda1813..d40a2a329888 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2112,6 +2112,7 @@ struct sctp_cmsgs { struct sctp_initmsg *init; struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; + struct sctp_prinfo *prinfo; }; /* Structure for tracking memory objects */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4c4db14786bd..0dd1f82a4fa8 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -260,6 +260,19 @@ struct sctp_nxtinfo { sctp_assoc_t nxt_assoc_id; }; +/* 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo + */ +struct sctp_prinfo { + __u16 pr_policy; + __u32 pr_value; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -293,6 +306,8 @@ typedef enum sctp_cmsg_type { #define SCTP_RCVINFO SCTP_RCVINFO SCTP_NXTINFO, /* 5.3.6 SCTP Next Receive Information Structure */ #define SCTP_NXTINFO SCTP_NXTINFO + SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ +#define SCTP_PRINFO SCTP_PRINFO } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7fa76031bb08..fdde697b37e7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1644,6 +1644,12 @@ static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; } + if (cmsgs->prinfo) { + srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value; + SCTP_PR_SET_POLICY(srinfo->sinfo_flags, + cmsgs->prinfo->pr_policy); + } + sflags = srinfo->sinfo_flags; if (!sflags && msg_len) return 0; @@ -1901,9 +1907,12 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, sinfo->sinfo_ppid = asoc->default_ppid; sinfo->sinfo_context = asoc->default_context; sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); + + if (!cmsgs->prinfo) + sinfo->sinfo_flags = asoc->default_flags; } - if (!cmsgs->srinfo) + if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; } @@ -7749,6 +7758,26 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_PRINFO: + /* SCTP Socket API Extension + * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo))) + return -EINVAL; + + cmsgs->prinfo = CMSG_DATA(cmsg); + if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK) + return -EINVAL; + + if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) + cmsgs->prinfo->pr_value = 0; + break; default: return -EINVAL; } -- cgit v1.2.3 From 2c0dbaa0c43d04d8d6daf52adb724c5789676b15 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:19 +0800 Subject: sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg This patch is to add support for Destination IPv4/6 Address options for sendmsg, as described in section 5.3.9/10 of RFC6458. With this option, you can provide more than one destination addrs to sendmsg when creating asoc, like sctp_connectx. It's also a necessary send info for sctp_sendv. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 6 ++++ net/sctp/socket.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) (limited to 'include/uapi') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d40a2a329888..ec6e46b7e119 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2113,6 +2113,7 @@ struct sctp_cmsgs { struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; + struct msghdr *addrs_msg; }; /* Structure for tracking memory objects */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0dd1f82a4fa8..a1bc35098033 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -308,6 +308,12 @@ typedef enum sctp_cmsg_type { #define SCTP_NXTINFO SCTP_NXTINFO SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ #define SCTP_PRINFO SCTP_PRINFO + SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure (RESERVED) */ +#define SCTP_AUTHINFO SCTP_AUTHINFO + SCTP_DSTADDRV4, /* 5.3.9 SCTP Destination IPv4 Address Structure */ +#define SCTP_DSTADDRV4 SCTP_DSTADDRV4 + SCTP_DSTADDRV6, /* 5.3.10 SCTP Destination IPv6 Address Structure */ +#define SCTP_DSTADDRV6 SCTP_DSTADDRV6 } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fdde697b37e7..067b57a330b1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1676,6 +1676,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct net *net = sock_net(sk); struct sctp_association *asoc; enum sctp_scope scope; + struct cmsghdr *cmsg; int err = -EINVAL; *tp = NULL; @@ -1741,6 +1742,67 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, goto free; } + if (!cmsgs->addrs_msg) + return 0; + + /* sendv addr list parse */ + for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { + struct sctp_transport *transport; + struct sctp_association *old; + union sctp_addr _daddr; + int dlen; + + if (cmsg->cmsg_level != IPPROTO_SCTP || + (cmsg->cmsg_type != SCTP_DSTADDRV4 && + cmsg->cmsg_type != SCTP_DSTADDRV6)) + continue; + + daddr = &_daddr; + memset(daddr, 0, sizeof(*daddr)); + dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); + if (cmsg->cmsg_type == SCTP_DSTADDRV4) { + if (dlen < sizeof(struct in_addr)) + goto free; + + dlen = sizeof(struct in_addr); + daddr->v4.sin_family = AF_INET; + daddr->v4.sin_port = htons(asoc->peer.port); + memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); + } else { + if (dlen < sizeof(struct in6_addr)) + goto free; + + dlen = sizeof(struct in6_addr); + daddr->v6.sin6_family = AF_INET6; + daddr->v6.sin6_port = htons(asoc->peer.port); + memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); + } + err = sctp_verify_addr(sk, daddr, sizeof(*daddr)); + if (err) + goto free; + + old = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (old && old != asoc) { + if (old->state >= SCTP_STATE_ESTABLISHED) + err = -EISCONN; + else + err = -EALREADY; + goto free; + } + + if (sctp_endpoint_is_peeled_off(ep, daddr)) { + err = -EADDRNOTAVAIL; + goto free; + } + + transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, + SCTP_UNKNOWN); + if (!transport) { + err = -ENOMEM; + goto free; + } + } + return 0; free: @@ -7778,6 +7840,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; + case SCTP_DSTADDRV4: + case SCTP_DSTADDRV6: + /* SCTP Socket API Extension + * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV4 struct in_addr + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV6 struct in6_addr + */ + cmsgs->addrs_msg = my_msg; + break; default: return -EINVAL; } -- cgit v1.2.3 From 4910280503f3af2857d5aa77e35b22d93a8960a8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:20 +0800 Subject: sctp: add support for snd flag SCTP_SENDALL process in sendmsg This patch is to add support for snd flag SCTP_SENDALL process in sendmsg, as described in section 5.3.4 of RFC6458. With this flag, you can send the same data to all the asocs of this sk once. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ net/sctp/socket.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a1bc35098033..e94b6d297ad9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -284,6 +284,8 @@ enum sctp_sinfo_flags { SCTP_ADDR_OVER = (1 << 1), /* Override the primary destination. */ SCTP_ABORT = (1 << 2), /* Send an ABORT message to the peer. */ SCTP_SACK_IMMEDIATELY = (1 << 3), /* SACK should be sent without delay. */ + /* 2 bits here have been used by SCTP_PR_SCTP_MASK */ + SCTP_SENDALL = (1 << 6), SCTP_NOTIFICATION = MSG_NOTIFICATION, /* Next message is not user msg but notification. */ SCTP_EOF = MSG_FIN, /* Initiate graceful shutdown process. */ }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 067b57a330b1..7d3476a4860d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1820,6 +1820,10 @@ static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) return -EPIPE; + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) && + !sctp_state(asoc, ESTABLISHED)) + return 0; + if (sflags & SCTP_EOF) { pr_debug("%s: shutting down association:%p\n", __func__, asoc); sctp_primitive_SHUTDOWN(net, asoc, NULL); @@ -2007,6 +2011,29 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) lock_sock(sk); + /* SCTP_SENDALL process */ + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { + list_for_each_entry(asoc, &ep->asocs, asocs) { + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, + msg_len); + if (err == 0) + continue; + if (err < 0) + goto out_unlock; + + sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); + + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, + NULL, sinfo); + if (err < 0) + goto out_unlock; + + iov_iter_revert(&msg->msg_iter, err); + } + + goto out_unlock; + } + /* Get and check or create asoc */ if (daddr) { asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); @@ -7792,8 +7819,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7816,8 +7843,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_PRINFO: -- cgit v1.2.3 From 95da0cdb723260362fc126a563285ac66a193da7 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Tue, 6 Mar 2018 10:55:01 -0800 Subject: bpf: add support to read sample address in bpf program This commit adds new field "addr" to bpf_perf_event_data which could be read and used by bpf programs attached to perf events. The value of the field is copied from bpf_perf_event_data_kern.addr and contains the address value recorded by specifying sample_type with PERF_SAMPLE_ADDR when calling perf_event_open. Signed-off-by: Teng Qin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf_perf_event.h | 1 + kernel/trace/bpf_trace.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h index 8f95303f9d80..eb1b9d21250c 100644 --- a/include/uapi/linux/bpf_perf_event.h +++ b/include/uapi/linux/bpf_perf_event.h @@ -13,6 +13,7 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; + __u64 addr; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c0a9e310d715..c634e093951f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -726,8 +726,7 @@ const struct bpf_prog_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -738,8 +737,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -766,6 +770,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, -- cgit v1.2.3 From 459d153d9916ea48b1550bbb6f2959dc03bff011 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 6 Mar 2018 18:11:14 +0100 Subject: net/sched: cls_flower: Add support to handle first frag as match field Allow setting firstfrag as matching option in tc flower classifier. # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_flags firstfrag action mirred egress redirect dev eth1 Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_flower.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7cafb26df555..be05e66c167b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -475,6 +475,7 @@ enum { enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), }; /* Match-all classifier */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7d0ce2c40f93..d964e60c730e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -511,6 +511,9 @@ static int fl_set_key_flags(struct nlattr **tb, fl_set_key_flag(key, mask, flags_key, flags_mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); return 0; } @@ -1130,6 +1133,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) fl_get_key_flag(flags_key, flags_mask, &key, &mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); _key = cpu_to_be32(key); _mask = cpu_to_be32(mask); -- cgit v1.2.3 From 84a1d9c4820080bebcbd413a845076dcb62f45fa Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 8 Mar 2018 15:45:03 +0000 Subject: net: ethtool: extend RXNFC API to support RSS spreading of filter matches We use a two-step process to configure a filter with RSS spreading. First, the RSS context is allocated and configured using ETHTOOL_SRSSH; this returns an identifier (rss_context) which can then be passed to subsequent invocations of ETHTOOL_SRXCLSRLINS to specify that the offset from the RSS indirection table lookup should be added to the queue number (ring_cookie) when delivering the packet. Drivers for devices which can only use the indirection table entry directly (not add it to a base queue number) should reject rule insertions combining RSS with a nonzero ring_cookie. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/ethtool.h | 5 ++++ include/uapi/linux/ethtool.h | 32 +++++++++++++++++----- net/core/ethtool.c | 64 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2ec41a7eb54f..ebe41811ed34 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -371,6 +371,11 @@ struct ethtool_ops { u8 *hfunc); int (*set_rxfh)(struct net_device *, const u32 *indir, const u8 *key, const u8 hfunc); + int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, + u8 *hfunc, u32 rss_context); + int (*set_rxfh_context)(struct net_device *, const u32 *indir, + const u8 *key, const u8 hfunc, + u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 44a0b675a6bc..20da156aaf64 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -914,12 +914,15 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW * @data: Command-dependent value * @fs: Flow classification rule + * @rss_context: RSS context to be affected * @rule_cnt: Number of rules to be affected * @rule_locs: Array of used rule locations * * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following - * structure fields must not be used. + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. * * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues * on return. @@ -931,7 +934,9 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * set in @data then special location values should not be used. * * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an - * existing rule on entry and @fs contains the rule on return. + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. * * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the * user buffer for @rule_locs on entry. On return, @data is the size @@ -942,7 +947,11 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. * @fs.@location either specifies the location to use or is a special * location value with %RX_CLS_LOC_SPECIAL flag set. On return, - * @fs.@location is the actual rule location. + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. * * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an * existing rule on entry. @@ -963,7 +972,10 @@ struct ethtool_rxnfc { __u32 flow_type; __u64 data; struct ethtool_rx_flow_spec fs; - __u32 rule_cnt; + union { + __u32 rule_cnt; + __u32 rss_context; + }; __u32 rule_locs[0]; }; @@ -990,7 +1002,11 @@ struct ethtool_rxfh_indir { /** * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH - * @rss_context: RSS context identifier. + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. * @indir_size: On entry, the array size of the user buffer for the * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, @@ -1009,7 +1025,8 @@ struct ethtool_rxfh_indir { * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested * and a @indir_size of zero means the indir table should be reset to default - * values. An hfunc of zero means that hash function setting is not requested. + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. */ struct ethtool_rxfh { __u32 cmd; @@ -1021,6 +1038,7 @@ struct ethtool_rxfh { __u32 rsvd32; __u32 rss_config[0]; }; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff #define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff /** @@ -1635,6 +1653,8 @@ static inline int ethtool_validate_duplex(__u8 duplex) /* Flag to enable additional fields in struct ethtool_rx_flow_spec */ #define FLOW_EXT 0x80000000 #define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 /* L3-L4 network traffic flow hash options */ #define RXH_L2DA (1 << 1) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3f89c76d5c24..157cd9efa4be 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1022,6 +1022,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + } + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) @@ -1251,9 +1260,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; rxfh.indir_size = dev_indir_size; rxfh.key_size = dev_key_size; @@ -1276,7 +1287,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (rxfh.rss_context) + ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, + &dev_hfunc, + rxfh.rss_context); + else + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); if (ret) goto out; @@ -1306,6 +1322,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *hkey = NULL; u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + bool delete = false; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; @@ -1319,9 +1336,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->set_rxfh_context) + return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key or function. @@ -1346,7 +1365,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; - /* rxfh.indir_size == 0 means reset the indir table to default. + /* rxfh.indir_size == 0 means reset the indir table to default (master + * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && @@ -1359,9 +1379,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; } else if (rxfh.indir_size == 0) { - indir = (u32 *)rss_config; - for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + if (rxfh.rss_context == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + delete = true; + } } if (rxfh.key_size) { @@ -1374,15 +1398,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (rxfh.rss_context) + ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, + &rxfh.rss_context, delete); + else + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); if (ret) goto out; - /* indicate whether rxfh was set to default */ - if (rxfh.indir_size == 0) - dev->priv_flags &= ~IFF_RXFH_CONFIGURED; - else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - dev->priv_flags |= IFF_RXFH_CONFIGURED; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), + &rxfh.rss_context, sizeof(rxfh.rss_context))) + ret = -EFAULT; + + if (!rxfh.rss_context) { + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; + } out: kfree(rss_config); -- cgit v1.2.3 From 41aeefcc38a2643a62db46818a70a781efb40d99 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 13 Mar 2018 11:41:12 +0100 Subject: batman-adv: add DAT cache netlink support Dump the list of DAT cache entries via the netlink socket. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 20 +++++ net/batman-adv/distributed-arp-table.c | 152 +++++++++++++++++++++++++++++++++ net/batman-adv/distributed-arp-table.h | 8 ++ net/batman-adv/netlink.c | 76 ++++++++++------- 4 files changed, 223 insertions(+), 33 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 56ae28934070..95ab5dbd09fa 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -272,6 +272,21 @@ enum batadv_nl_attrs { */ BATADV_ATTR_BLA_CRC, + /** + * @BATADV_ATTR_DAT_CACHE_IP4ADDRESS: Client IPv4 address + */ + BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_HWADDRESS: Client MAC address + */ + BATADV_ATTR_DAT_CACHE_HWADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_VID: VLAN ID + */ + BATADV_ATTR_DAT_CACHE_VID, + /* add attributes above here, update the policy in netlink.c */ /** @@ -361,6 +376,11 @@ enum batadv_nl_commands { */ BATADV_CMD_GET_BLA_BACKBONE, + /** + * @BATADV_CMD_GET_DAT_CACHE: Query list of DAT cache entries + */ + BATADV_CMD_GET_DAT_CACHE, + /* add new commands above here */ /** diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 4469dcc1558f..75dda9454ccf 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -43,13 +44,19 @@ #include #include #include +#include +#include +#include +#include #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "send.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -851,6 +858,151 @@ out: } #endif +/** + * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a + * netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @dat_entry: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_dat_entry *dat_entry) +{ + int msecs; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + if (!hdr) + return -ENOBUFS; + + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); + + if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + dat_entry->ip) || + nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN, + dat_entry->mac_addr) || + nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to + * a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_dat_entry *dat_entry; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + if (idx < *idx_skip) + goto skip; + + if (batadv_dat_cache_dump_entry(msg, portid, seq, + dat_entry)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->dat.hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_dat_cache_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, head, + &idx)) + break; + + bucket++; + idx = 0; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + /** * batadv_arp_get_type() - parse an ARP packet and gets the type * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index e24aa9601c52..a04596028337 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -28,6 +28,7 @@ #include "originator.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -81,6 +82,7 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv, int batadv_dat_init(struct batadv_priv *bat_priv); void batadv_dat_free(struct batadv_priv *bat_priv); int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb); /** * batadv_dat_inc_counter() - increment the correct DAT packet counter @@ -169,6 +171,12 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv) { } +static inline int +batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, u8 subtype) { diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 129af56b944d..2b3e7c3f87fa 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -45,6 +45,7 @@ #include "bat_algo.h" #include "bridge_loop_avoidance.h" +#include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "originator.h" @@ -64,39 +65,42 @@ static const struct genl_multicast_group batadv_netlink_mcgrps[] = { }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { - [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, - [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, - [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, - [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, - [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, - [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, - [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, - [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, - [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, - [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, - [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TQ] = { .type = NLA_U8 }, - [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, - [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, - [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, + [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, + [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, + [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, + [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, + [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, }; /** @@ -604,6 +608,12 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_bla_backbone_dump, }, + { + .cmd = BATADV_CMD_GET_DAT_CACHE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_dat_cache_dump, + }, }; -- cgit v1.2.3 From 53dd9a68ba683986ec90497586f94b941bb748a0 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 13 Mar 2018 11:41:13 +0100 Subject: batman-adv: add multicast flags netlink support Dump the list of multicast flags entries via the netlink socket. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 62 +++++++++++ net/batman-adv/multicast.c | 237 ++++++++++++++++++++++++++++++++++++++++ net/batman-adv/multicast.h | 18 +++ net/batman-adv/netlink.c | 12 ++ 4 files changed, 329 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 95ab5dbd09fa..324a0e1143e7 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -91,6 +91,53 @@ enum batadv_tt_client_flags { BATADV_TT_CLIENT_TEMP = (1 << 11), }; +/** + * enum batadv_mcast_flags_priv - Private, own multicast flags + * + * These are internal, multicast related flags. Currently they describe certain + * multicast related attributes of the segment this originator bridges into the + * mesh. + * + * Those attributes are used to determine the public multicast flags this + * originator is going to announce via TT. + * + * For netlink, if BATADV_MCAST_FLAGS_BRIDGED is unset then all querier + * related flags are undefined. + */ +enum batadv_mcast_flags_priv { + /** + * @BATADV_MCAST_FLAGS_BRIDGED: There is a bridge on top of the mesh + * interface. + */ + BATADV_MCAST_FLAGS_BRIDGED = (1 << 0), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS: Whether an IGMP querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS = (1 << 1), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS: Whether an MLD querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS = (1 << 2), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING: If an IGMP querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING = (1 << 3), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING: If an MLD querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING = (1 << 4), +}; + /** * enum batadv_nl_attrs - batman-adv netlink attributes */ @@ -287,6 +334,16 @@ enum batadv_nl_attrs { */ BATADV_ATTR_DAT_CACHE_VID, + /** + * @BATADV_ATTR_MCAST_FLAGS: Per originator multicast flags + */ + BATADV_ATTR_MCAST_FLAGS, + + /** + * @BATADV_ATTR_MCAST_FLAGS_PRIV: Private, own multicast flags + */ + BATADV_ATTR_MCAST_FLAGS_PRIV, + /* add attributes above here, update the policy in netlink.c */ /** @@ -381,6 +438,11 @@ enum batadv_nl_commands { */ BATADV_CMD_GET_DAT_CACHE, + /** + * @BATADV_CMD_GET_MCAST_FLAGS: Query list of multicast flags + */ + BATADV_CMD_GET_MCAST_FLAGS, + /* add new commands above here */ /** diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index c7a1305ca7e7..5615b6abea6f 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -52,14 +53,20 @@ #include #include #include +#include #include #include #include +#include +#include #include +#include #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -1333,6 +1340,236 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) } #endif +/** + * batadv_mcast_mesh_info_put() - put multicast info into a netlink message + * @msg: buffer for the message + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 or error code. + */ +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv) +{ + u32 flags = bat_priv->mcast.flags; + u32 flags_priv = BATADV_NO_FLAGS; + + if (bat_priv->mcast.bridged) { + flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; + + if (bat_priv->mcast.querier_ipv4.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; + if (bat_priv->mcast.querier_ipv6.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; + if (bat_priv->mcast.querier_ipv4.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; + if (bat_priv->mcast.querier_ipv6.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; + } + + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || + nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) + return -EMSGSIZE; + + return 0; +} + +/** + * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @orig_node: originator to dump the multicast flags of + * + * Return: 0 or error code. + */ +static int +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_orig_node *orig_node) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capabilities)) { + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, + orig_node->mcast_flags)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags + * table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, long *idx_skip) +{ + struct batadv_orig_node *orig_node; + long idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capa_initialized)) + continue; + + if (idx < *idx_skip) + goto skip; + + if (batadv_mcast_flags_dump_entry(msg, portid, seq, + orig_node)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @bat_priv: the bat priv with all the soft interface information + * @bucket: current bucket to dump + * @idx: index in current bucket to the next entry to dump + * + * Return: 0 or error code. + */ +static int +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, long *bucket, long *idx) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + long bucket_tmp = *bucket; + struct hlist_head *head; + long idx_tmp = *idx; + + while (bucket_tmp < hash->size) { + head = &hash->table[bucket_tmp]; + + if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, + &idx_tmp)) + break; + + bucket_tmp++; + idx_tmp = 0; + } + + *bucket = bucket_tmp; + *idx = idx_tmp; + + return msg->len; +} + +/** + * batadv_mcast_netlink_get_primary() - get primary interface from netlink + * callback + * @cb: netlink callback structure + * @primary_if: the primary interface pointer to return the result in + * + * Return: 0 or error code. + */ +static int +batadv_mcast_netlink_get_primary(struct netlink_callback *cb, + struct batadv_hard_iface **primary_if) +{ + struct batadv_hard_iface *hard_iface = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + hard_iface = batadv_primary_if_get_selected(bat_priv); + if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + +out: + if (soft_iface) + dev_put(soft_iface); + + if (!ret && primary_if) + *primary_if = hard_iface; + else + batadv_hardif_put(hard_iface); + + return ret; +} + +/** + * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_priv *bat_priv; + long *bucket = &cb->args[0]; + long *idx = &cb->args[1]; + int ret; + + ret = batadv_mcast_netlink_get_primary(cb, &primary_if); + if (ret) + return ret; + + bat_priv = netdev_priv(primary_if->soft_iface); + ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, bucket, idx); + + batadv_hardif_put(primary_if); + return ret; +} + /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 6b8594e23da3..3b04ab13f0eb 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -21,6 +21,7 @@ #include "main.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -54,6 +55,11 @@ void batadv_mcast_init(struct batadv_priv *bat_priv); int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv); + +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb); + void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); @@ -72,6 +78,18 @@ static inline int batadv_mcast_init(struct batadv_priv *bat_priv) return 0; } +static inline int +batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline int batadv_mcast_flags_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_mcast_free(struct batadv_priv *bat_priv) { } diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 2b3e7c3f87fa..0d9459b69bdb 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -48,6 +48,7 @@ #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" +#include "multicast.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" @@ -101,6 +102,8 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, }; /** @@ -151,6 +154,9 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) goto out; #endif + if (batadv_mcast_mesh_info_put(msg, bat_priv)) + goto out; + primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; @@ -614,6 +620,12 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_dat_cache_dump, }, + { + .cmd = BATADV_CMD_GET_MCAST_FLAGS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_mcast_flags_dump, + }, }; -- cgit v1.2.3 From 3ff547c06a7d75d72d37dae2c064fcf0672e56c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:31 +0800 Subject: sctp: add support for SCTP AUTH Information for sendmsg This patch is to add support for SCTP AUTH Information for sendmsg, as described in section 5.3.8 of RFC6458. With this option, you can provide shared key identifier used for sending the user message. It's also a necessary send info for sctp_sendv. Note that it reuses sinfo->sinfo_tsn to indicate if this option is set and sinfo->sinfo_ssn to save the shkey ID which can be 0. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 14 +++++++++++++- net/sctp/chunk.c | 11 ++++++++++- net/sctp/socket.c | 23 +++++++++++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 49ad67bbdbb5..012fb3e2f4cf 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2118,6 +2118,7 @@ struct sctp_cmsgs { struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; + struct sctp_authinfo *authinfo; struct msghdr *addrs_msg; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index e94b6d297ad9..47e781ebde05 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -273,6 +273,18 @@ struct sctp_prinfo { __u32 pr_value; }; +/* 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo + */ +struct sctp_authinfo { + __u16 auth_keynumber; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -310,7 +322,7 @@ typedef enum sctp_cmsg_type { #define SCTP_NXTINFO SCTP_NXTINFO SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ #define SCTP_PRINFO SCTP_PRINFO - SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure (RESERVED) */ + SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure */ #define SCTP_AUTHINFO SCTP_AUTHINFO SCTP_DSTADDRV4, /* 5.3.9 SCTP Destination IPv4 Address Structure */ #define SCTP_DSTADDRV4 SCTP_DSTADDRV4 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 9f28a9aae976..f889a84f264d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -206,7 +206,16 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); - shkey = asoc->shkey; + if (sinfo->sinfo_tsn && + sinfo->sinfo_ssn != asoc->active_key_id) { + shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); + if (!shkey) { + err = -EINVAL; + goto errout; + } + } else { + shkey = asoc->shkey; + } } /* Check what's our max considering the above */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 003a4ad89c01..9ffdecbc3531 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1987,6 +1987,14 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; + + if (cmsgs->authinfo) { + /* Reuse sinfo_tsn to indicate that authinfo was set and + * sinfo_ssn to save the keyid on tx path. + */ + sinfo->sinfo_tsn = 1; + sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber; + } } static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) @@ -7874,6 +7882,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; + case SCTP_AUTHINFO: + /* SCTP Socket API Extension + * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo))) + return -EINVAL; + + cmsgs->authinfo = CMSG_DATA(cmsg); + break; case SCTP_DSTADDRV4: case SCTP_DSTADDRV6: /* SCTP Socket API Extension -- cgit v1.2.3 From 601590ec155aadf5daa17a6f63a06d1bba5b5ce9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:32 +0800 Subject: sctp: add sockopt SCTP_AUTH_DEACTIVATE_KEY This patch is to add sockopt SCTP_AUTH_DEACTIVATE_KEY, as described in section 8.3.4 of RFC6458. This set option indicates that the application will no longer send user messages using the indicated key identifier. Note that RFC requires that only deactivated keys that are no longer used by an association can be deleted, but for the backward compatibility, it is not to check deactivated when deleting or replacing one sh_key. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/auth.h | 12 ++++++------ include/uapi/linux/sctp.h | 1 + net/sctp/auth.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- net/sctp/socket.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 017c1aa3a2c9..687e7f80037d 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -65,6 +65,7 @@ struct sctp_shared_key { struct sctp_auth_bytes *key; refcount_t refcnt; __u16 key_id; + __u8 deactivated; }; #define key_for_each(__key, __list_head) \ @@ -113,14 +114,13 @@ void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key); int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id); int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, struct sctp_hmacalgo *hmacs); -int sctp_auth_set_key(struct sctp_endpoint *ep, - struct sctp_association *asoc, +int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key); int sctp_auth_set_active_key(struct sctp_endpoint *ep, - struct sctp_association *asoc, - __u16 key_id); + struct sctp_association *asoc, __u16 key_id); int sctp_auth_del_key_id(struct sctp_endpoint *ep, - struct sctp_association *asoc, - __u16 key_id); + struct sctp_association *asoc, __u16 key_id); +int sctp_auth_deact_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, __u16 key_id); #endif diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 47e781ebde05..08fc313829f4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -99,6 +99,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RECVRCVINFO 32 #define SCTP_RECVNXTINFO 33 #define SCTP_DEFAULT_SNDINFO 34 +#define SCTP_AUTH_DEACTIVATE_KEY 35 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. diff --git a/net/sctp/auth.c b/net/sctp/auth.c index e5214fd7f650..a073123fc485 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -449,8 +449,11 @@ struct sctp_shared_key *sctp_auth_get_shkey( /* First search associations set of endpoint pair shared keys */ key_for_each(key, &asoc->endpoint_shared_keys) { - if (key->key_id == key_id) - return key; + if (key->key_id == key_id) { + if (!key->deactivated) + return key; + break; + } } return NULL; @@ -905,7 +908,7 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, } } - if (!found) + if (!found || key->deactivated) return -EINVAL; if (asoc) { @@ -956,3 +959,40 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep, return 0; } + +int sctp_auth_deact_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, __u16 key_id) +{ + struct sctp_shared_key *key; + struct list_head *sh_keys; + int found = 0; + + /* The key identifier MUST NOT be the current active key + * The key identifier MUST correst to an existing key + */ + if (asoc) { + if (asoc->active_key_id == key_id) + return -EINVAL; + + sh_keys = &asoc->endpoint_shared_keys; + } else { + if (ep->active_key_id == key_id) + return -EINVAL; + + sh_keys = &ep->endpoint_shared_keys; + } + + key_for_each(key, sh_keys) { + if (key->key_id == key_id) { + found = 1; + break; + } + } + + if (!found) + return -EINVAL; + + key->deactivated = 1; + + return 0; +} diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9ffdecbc3531..65cc354c520f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3646,6 +3646,33 @@ static int sctp_setsockopt_del_key(struct sock *sk, } +/* + * 8.3.4 Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY) + * + * This set option will deactivate a shared secret key. + */ +static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!ep->auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); +} + /* * 8.1.23 SCTP_AUTO_ASCONF * @@ -4238,6 +4265,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_DELETE_KEY: retval = sctp_setsockopt_del_key(sk, optval, optlen); break; + case SCTP_AUTH_DEACTIVATE_KEY: + retval = sctp_setsockopt_deactivate_key(sk, optval, optlen); + break; case SCTP_AUTO_ASCONF: retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; @@ -7212,6 +7242,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_KEY: case SCTP_AUTH_CHUNK: case SCTP_AUTH_DELETE_KEY: + case SCTP_AUTH_DEACTIVATE_KEY: retval = -EOPNOTSUPP; break; case SCTP_HMAC_IDENT: -- cgit v1.2.3 From ec2e506c680deaa8e1a087986db6d73ba63a04be Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:33 +0800 Subject: sctp: add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT This patch is to add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT, as described in section 6.1.8 of RFC6458. SCTP_AUTH_FREE_KEY: This report indicates that the SCTP implementation will no longer use the key identifier specified in auth_keynumber. After deactivating a key, it would never be used again, which means it's refcnt can't be held/increased by new chunks. But there may be some chunks in out queue still using it. So only when refcnt is 1, which means no chunk in outqueue is using/holding this key either, this EVENT would be sent. When users receive this notification, they could do DEL_KEY sockopt to remove this shkey, and also tell the peer that this key won't be used in any chunk thoroughly from now on, then the peer can remove it as well safely. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 6 +++++- net/sctp/auth.c | 14 ++++++++++++++ net/sctp/sm_make_chunk.c | 20 +++++++++++++++++++- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 19 ++++++++++++++++++- 5 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 08fc313829f4..18ebbfeee4af 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -518,7 +518,11 @@ struct sctp_authkey_event { sctp_assoc_t auth_assoc_id; }; -enum { SCTP_AUTH_NEWKEY = 0, }; +enum { + SCTP_AUTH_NEW_KEY, +#define SCTP_AUTH_NEWKEY SCTP_AUTH_NEW_KEY /* compatible with before */ + SCTP_AUTH_FREE_KEY, +}; /* * 6.1.9. SCTP_SENDER_DRY_EVENT diff --git a/net/sctp/auth.c b/net/sctp/auth.c index a073123fc485..e64630cd3331 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -992,6 +992,20 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, if (!found) return -EINVAL; + /* refcnt == 1 and !list_empty mean it's not being used anywhere + * and deactivated will be set, so it's time to notify userland + * that this shkey can be freed. + */ + if (asoc && !list_empty(&key->key_list) && + refcount_read(&key->refcnt) == 1) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, key->key_id, + SCTP_AUTH_FREE_KEY, GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + key->deactivated = 1; return 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 10f071cdf188..cc20bc39ee7c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -89,8 +89,26 @@ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; - if (chunk->shkey) + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + struct sctp_association *asoc = chunk->asoc; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } sctp_auth_shkey_release(chunk->shkey); + } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 792e0e2be320..1e41dee70b51 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4246,7 +4246,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net, struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id), - SCTP_AUTH_NEWKEY, GFP_ATOMIC); + SCTP_AUTH_NEW_KEY, GFP_ATOMIC); if (!ev) return -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 65cc354c520f..aeecdd620c45 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8166,8 +8166,25 @@ static void sctp_wfree(struct sk_buff *skb) sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); - if (chunk->shkey) + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } sctp_auth_shkey_release(chunk->shkey); + } sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); -- cgit v1.2.3 From 30f6ebf65bc46161c5aaff1db2e6e7c76aa4a06b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:34 +0800 Subject: sctp: add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT This patch is to add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT, as described in section 6.1.8 of RFC6458. SCTP_AUTH_NO_AUTH: This report indicates that the peer does not support SCTP authentication as defined in [RFC4895]. Note that the implementation is quite similar as that of SCTP_ADAPTATION_INDICATION. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 + include/uapi/linux/sctp.h | 1 + net/sctp/sm_sideeffect.c | 13 +++++++++++++ net/sctp/sm_statefuns.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index b55c6a48a206..6640f84fe536 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -100,6 +100,7 @@ enum sctp_verb { SCTP_CMD_SET_SK_ERR, /* Set sk_err */ SCTP_CMD_ASSOC_CHANGE, /* generate and send assoc_change event */ SCTP_CMD_ADAPTATION_IND, /* generate and send adaptation event */ + SCTP_CMD_PEER_NO_AUTH, /* generate and send authentication event */ SCTP_CMD_ASSOC_SHKEY, /* generate the association shared keys */ SCTP_CMD_T1_RETRAN, /* Mark for retransmission after T1 timeout */ SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 18ebbfeee4af..afd4346386e0 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -522,6 +522,7 @@ enum { SCTP_AUTH_NEW_KEY, #define SCTP_AUTH_NEWKEY SCTP_AUTH_NEW_KEY /* compatible with before */ SCTP_AUTH_FREE_KEY, + SCTP_AUTH_NO_AUTH, }; /* diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b71e7fb0a20a..298112ca8c06 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1049,6 +1049,16 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } +static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, + struct sctp_association *asoc) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); +} + /* Helper function to generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) @@ -1755,6 +1765,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; + case SCTP_CMD_PEER_NO_AUTH: + sctp_cmd_peer_no_auth(commands, asoc); + break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1e41dee70b51..cc56a67dbb4d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -659,7 +659,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, void *arg, struct sctp_cmd_seq *commands) { - struct sctp_ulpevent *ev, *ai_ev = NULL; + struct sctp_ulpevent *ev, *ai_ev = NULL, *auth_ev = NULL; struct sctp_association *new_asoc; struct sctp_init_chunk *peer_init; struct sctp_chunk *chunk = arg; @@ -820,6 +820,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_aiev; } + if (!new_asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(new_asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem_authev; + } + /* Add all the state machine commands now since we've created * everything. This way we don't introduce memory corruptions * during side-effect processing and correclty count established @@ -847,8 +855,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(auth_ev)); + return SCTP_DISPOSITION_CONSUME; +nomem_authev: + sctp_ulpevent_free(ai_ev); nomem_aiev: sctp_ulpevent_free(ev); nomem_ev: @@ -953,6 +967,15 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, SCTP_ULPEVENT(ev)); } + if (!asoc->peer.auth_capable) { + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!ev) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + return SCTP_DISPOSITION_CONSUME; nomem: return SCTP_DISPOSITION_NOMEM; @@ -1908,6 +1931,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( if (asoc->peer.adaptation_ind) sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL()); + if (!asoc->peer.auth_capable) + sctp_add_cmd_sf(commands, SCTP_CMD_PEER_NO_AUTH, SCTP_NULL()); + return SCTP_DISPOSITION_CONSUME; nomem: @@ -1954,7 +1980,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { - struct sctp_ulpevent *ev = NULL, *ai_ev = NULL; + struct sctp_ulpevent *ev = NULL, *ai_ev = NULL, *auth_ev = NULL; struct sctp_chunk *repl; /* Clarification from Implementor's Guide: @@ -2001,6 +2027,14 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( goto nomem; } + + if (!asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem; + } } repl = sctp_make_cookie_ack(new_asoc, chunk); @@ -2015,10 +2049,15 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( if (ai_ev) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(auth_ev)); return SCTP_DISPOSITION_CONSUME; nomem: + if (auth_ev) + sctp_ulpevent_free(auth_ev); if (ai_ev) sctp_ulpevent_free(ai_ev); if (ev) -- cgit v1.2.3 From 615755a77b2461ed78dfafb8a6649456201949c7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Mar 2018 10:23:21 -0700 Subject: bpf: extend stackmap to save binary_build_id+offset instead of address Currently, bpf stackmap store address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain the mapping from these virtual address to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system wide profiling that is always on. This patch tries to solve this problem with a variation of stackmap. This variation is enabled by flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, the variation stores ELF file build_id + offset. Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash: [user@]$ readelf -n /bin/bash ... Build ID: XXXXXXXXXX ... With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse Build ID for each entry in the call trace, and translate it into the following struct: struct bpf_stack_build_id_offset { __s32 status; unsigned char build_id[BPF_BUILD_ID_SIZE]; union { __u64 offset; __u64 ip; }; }; The search of build_id is limited to the first page of the file, and this page should be in page cache. Otherwise, we fallback to store ip for this entry (ip field in struct bpf_stack_build_id_offset). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files in a few different systems shows that almost all binary and dynamic library files have build_id in the first page. Build_id is only meaningful for user stack. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fallback to only store ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if build_id lookup failed for some reason, it will also fallback to store ip. User space can access struct bpf_stack_build_id_offset with bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain mapping from build id to binary files. This mostly static mapping is much easier to maintain than per process address maps. Note: Stackmap with build_id only works in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma(). If this changes, we would like to allow build_id lookup in nmi context. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 22 ++++ kernel/bpf/stackmap.c | 257 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 257 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a66769e5875..1e15d1724d89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43f5894..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include #include #include +#include +#include #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -114,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is + * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(¤t->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(¤t->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); -- cgit v1.2.3 From 7156d194a0772f733865267e7207e0b08f81b02b Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 16 Mar 2018 10:51:07 -0700 Subject: tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS that reports tcp_sock.snd_ssthresh. Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Priyaranjan Jha Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 4c0ae0faf7ca..560374c978f9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -243,6 +243,7 @@ enum { TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ + TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fb350f740f69..e553f84bde83 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 4 * nla_total_size(sizeof(u32)) + + 5 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3061,6 +3061,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); -- cgit v1.2.3 From 928df1880e24bcd47d6359ff86df24db3dfba3c3 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:51 +0100 Subject: tipc: obsolete TIPC_ZONE_SCOPE Publications for TIPC_CLUSTER_SCOPE and TIPC_ZONE_SCOPE are in all aspects handled the same way, both on the publishing node and on the receiving nodes. Despite previous ambitions to the contrary, this is never going to change, so we take the conseqeunce of this and obsolete TIPC_ZONE_SCOPE and related macros/functions. Whenever a user is doing a bind() or a sendmsg() attempt using ZONE_SCOPE we translate this internally to CLUSTER_SCOPE, while we remain compatible with users and remote nodes still using ZONE_SCOPE. Furthermore, the non-formalized scope value 0 has always been permitted for use during lookup, with the same meaning as ZONE_SCOPE/CLUSTER_SCOPE. We now permit it even as binding scope, but for compatibility reasons we choose to not change the value of TIPC_CLUSTER_SCOPE. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 102 ++++++++++++++++++++++++---------------------- net/tipc/addr.c | 31 -------------- net/tipc/addr.h | 10 +++++ net/tipc/msg.c | 2 +- net/tipc/name_table.c | 3 +- net/tipc/net.c | 2 +- net/tipc/socket.c | 15 ++++--- 7 files changed, 77 insertions(+), 88 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 14bacc7e6cef..4ac9f1f02b06 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -61,50 +61,6 @@ struct tipc_name_seq { __u32 upper; }; -/* TIPC Address Size, Offset, Mask specification for Z.C.N - */ -#define TIPC_NODE_BITS 12 -#define TIPC_CLUSTER_BITS 12 -#define TIPC_ZONE_BITS 8 - -#define TIPC_NODE_OFFSET 0 -#define TIPC_CLUSTER_OFFSET TIPC_NODE_BITS -#define TIPC_ZONE_OFFSET (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS) - -#define TIPC_NODE_SIZE ((1UL << TIPC_NODE_BITS) - 1) -#define TIPC_CLUSTER_SIZE ((1UL << TIPC_CLUSTER_BITS) - 1) -#define TIPC_ZONE_SIZE ((1UL << TIPC_ZONE_BITS) - 1) - -#define TIPC_NODE_MASK (TIPC_NODE_SIZE << TIPC_NODE_OFFSET) -#define TIPC_CLUSTER_MASK (TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET) -#define TIPC_ZONE_MASK (TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET) - -#define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) - -static inline __u32 tipc_addr(unsigned int zone, - unsigned int cluster, - unsigned int node) -{ - return (zone << TIPC_ZONE_OFFSET) | - (cluster << TIPC_CLUSTER_OFFSET) | - node; -} - -static inline unsigned int tipc_zone(__u32 addr) -{ - return addr >> TIPC_ZONE_OFFSET; -} - -static inline unsigned int tipc_cluster(__u32 addr) -{ - return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET; -} - -static inline unsigned int tipc_node(__u32 addr) -{ - return addr & TIPC_NODE_MASK; -} - /* * Application-accessible port name types */ @@ -117,9 +73,10 @@ static inline unsigned int tipc_node(__u32 addr) /* * Publication scopes when binding port names and port name sequences */ -#define TIPC_ZONE_SCOPE 1 -#define TIPC_CLUSTER_SCOPE 2 -#define TIPC_NODE_SCOPE 3 +enum tipc_scope { + TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */ + TIPC_NODE_SCOPE = 3 +}; /* * Limiting values for messages @@ -243,7 +200,7 @@ struct sockaddr_tipc { struct tipc_group_req { __u32 type; /* group id */ __u32 instance; /* member id */ - __u32 scope; /* zone/cluster/node */ + __u32 scope; /* cluster/node */ __u32 flags; }; @@ -268,4 +225,53 @@ struct tipc_sioc_ln_req { __u32 bearer_id; char linkname[TIPC_MAX_LINK_NAME]; }; + + +/* The macros and functions below are deprecated: + */ + +#define TIPC_ZONE_SCOPE 1 + +#define TIPC_NODE_BITS 12 +#define TIPC_CLUSTER_BITS 12 +#define TIPC_ZONE_BITS 8 + +#define TIPC_NODE_OFFSET 0 +#define TIPC_CLUSTER_OFFSET TIPC_NODE_BITS +#define TIPC_ZONE_OFFSET (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS) + +#define TIPC_NODE_SIZE ((1UL << TIPC_NODE_BITS) - 1) +#define TIPC_CLUSTER_SIZE ((1UL << TIPC_CLUSTER_BITS) - 1) +#define TIPC_ZONE_SIZE ((1UL << TIPC_ZONE_BITS) - 1) + +#define TIPC_NODE_MASK (TIPC_NODE_SIZE << TIPC_NODE_OFFSET) +#define TIPC_CLUSTER_MASK (TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET) +#define TIPC_ZONE_MASK (TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET) + +#define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) + +static inline __u32 tipc_addr(unsigned int zone, + unsigned int cluster, + unsigned int node) +{ + return (zone << TIPC_ZONE_OFFSET) | + (cluster << TIPC_CLUSTER_OFFSET) | + node; +} + +static inline unsigned int tipc_zone(__u32 addr) +{ + return addr >> TIPC_ZONE_OFFSET; +} + +static inline unsigned int tipc_cluster(__u32 addr) +{ + return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET; +} + +static inline unsigned int tipc_node(__u32 addr) +{ + return addr & TIPC_NODE_MASK; +} + #endif diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 48fd3b5a73fb..97cd857d7f43 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -63,23 +63,6 @@ int in_own_node(struct net *net, u32 addr) return (addr == tn->own_addr) || !addr; } -/** - * addr_domain - convert 2-bit scope value to equivalent message lookup domain - * - * Needed when address of a named message must be looked up a second time - * after a network hop. - */ -u32 addr_domain(struct net *net, u32 sc) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - if (likely(sc == TIPC_NODE_SCOPE)) - return tn->own_addr; - if (sc == TIPC_CLUSTER_SCOPE) - return tipc_cluster_mask(tn->own_addr); - return tipc_zone_mask(tn->own_addr); -} - /** * tipc_addr_domain_valid - validates a network domain address * @@ -124,20 +107,6 @@ int tipc_in_scope(u32 domain, u32 addr) return 0; } -/** - * tipc_addr_scope - convert message lookup domain to a 2-bit scope value - */ -int tipc_addr_scope(u32 domain) -{ - if (likely(!domain)) - return TIPC_ZONE_SCOPE; - if (tipc_node(domain)) - return TIPC_NODE_SCOPE; - if (tipc_cluster(domain)) - return TIPC_CLUSTER_SCOPE; - return TIPC_ZONE_SCOPE; -} - char *tipc_addr_string_fill(char *string, u32 addr) { snprintf(string, 16, "<%u.%u.%u>", diff --git a/net/tipc/addr.h b/net/tipc/addr.h index bebb347803ce..2ecf5a5d40dd 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -60,6 +60,16 @@ static inline u32 tipc_cluster_mask(u32 addr) return addr & TIPC_ZONE_CLUSTER_MASK; } +static inline int tipc_node2scope(u32 node) +{ + return node ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE; +} + +static inline int tipc_scope2node(struct net *net, int sc) +{ + return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net); +} + u32 tipc_own_addr(struct net *net); int in_own_cluster(struct net *net, u32 addr); int in_own_cluster_exact(struct net *net, u32 addr); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4e1c6f6450bb..b6c45dccba3d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -580,7 +580,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; - dnode = addr_domain(net, msg_lookup_scope(msg)); + dnode = tipc_scope2node(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), msg_nameinst(msg), &dnode); if (!dport) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e01c9c691ba2..6772390fcb00 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -473,8 +473,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, struct name_seq *seq = nametbl_find_seq(net, type); int index = hash(type); - if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) || - (lower > upper)) { + if (scope > TIPC_NODE_SCOPE || lower > upper) { pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n", type, lower, upper, scope); return NULL; diff --git a/net/tipc/net.c b/net/tipc/net.c index 1a2fde0d6f61..5c4c4405b78e 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -118,7 +118,7 @@ int tipc_net_start(struct net *net, u32 addr) tipc_sk_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, - TIPC_ZONE_SCOPE, 0, tn->own_addr); + TIPC_CLUSTER_SCOPE, 0, tn->own_addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, network identity %u\n", diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8b04e601311c..910d3827f499 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -644,7 +644,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, goto exit; } - res = (addr->scope > 0) ? + res = (addr->scope >= 0) ? tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: @@ -1280,8 +1280,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 type, inst, domain; u32 dnode, dport; + u32 type, inst; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1332,13 +1332,12 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (dest->addrtype == TIPC_ADDR_NAME) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; - domain = dest->addr.name.domain; - dnode = domain; + dnode = dest->addr.name.domain; msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_addr_scope(domain)); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); @@ -2592,6 +2591,9 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct publication *publ; u32 key; + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + if (tipc_sk_connected(sk)) return -EINVAL; key = tsk->portid + tsk->pub_count + 1; @@ -2617,6 +2619,9 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct publication *safe; int rc = -EINVAL; + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { if (seq) { if (publ->scope != scope) -- cgit v1.2.3 From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. Similar to previous sockmap usages when a sock is added to a sockmap, via a map update, if the map contains a BPF_SK_MSG_VERDICT program type attached then the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in sendmsg case and page/offset in sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes SK_PASS and SK_DROP. Returning SK_DROP free's the copied data in the sendmsg case and in the sendpage case leaves the data untouched. Both cases return -EACESS to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example a sendmsg call with many one byte iov entries will likely be pushed into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied. We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this the initial start/end data pointers are (0,0). Meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call. The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the currently being processed message. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similar to existing redirect use cases. This allows policy to redirect msgs. Pseudo code simple example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure psock is not lost across the sendmsg copy into the sg, the bpf program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 ++ include/uapi/linux/bpf.h | 22 +- kernel/bpf/sockmap.c | 712 +++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 14 +- kernel/bpf/verifier.c | 5 +- net/core/filter.c | 106 +++++++ 8 files changed, 857 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66df387106de..819229c80eca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index fdb691b520c0..109d05ccea9a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e15d1724d89..ef529539abde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -718,6 +720,15 @@ union bpf_attr { * int bpf_override_return(pt_regs, rc) * @pt_regs: pointer to struct pt_regs * @rc: the return value to set + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map using key as a lookup key for the + * sock in map. + * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -779,7 +790,8 @@ union bpf_attr { FN(perf_prog_read_value), \ FN(getsockopt), \ FN(override_return), \ - FN(sock_ops_cb_flags_set), + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -942,6 +954,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 051b224270e3..69c5bccabd22 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -73,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -91,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -115,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -174,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -185,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. + */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -412,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -482,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -503,6 +1154,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); @@ -711,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -737,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -755,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -769,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - refcount_inc(&psock->refcnt); + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -777,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -794,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -815,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -829,6 +1504,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -843,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -904,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..3aeb4ea2a93a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb79a34359c0..e9f7c20691c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1248,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2071,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2109,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index 33edfa8372fd..2b6c47597eab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3591,6 +3629,16 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 2a100317c9ebc204a166f16294884fbf9da074ce Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:15 -0700 Subject: bpf: sockmap, add bpf_msg_apply_bytes() helper A single sendmsg or sendfile system call can contain multiple logical messages that a BPF program may want to read and apply a verdict. But, without an apply_bytes helper any verdict on the data applies to all bytes in the sendmsg/sendfile. Alternatively, a BPF program may only care to read the first N bytes of a msg. If the payload is large say MB or even GB setting up and calling the BPF program repeatedly for all bytes, even though the verdict is already known, creates unnecessary overhead. To allow BPF programs to control how many bytes a given verdict applies to we implement a bpf_msg_apply_bytes() helper. When called from within a BPF program this sets a counter, internal to the BPF infrastructure, that applies the last verdict to the next N bytes. If the N is smaller than the current data being processed from a sendmsg/sendfile call, the first N bytes will be sent and the BPF program will be re-run with start_data pointing to the N+1 byte. If N is larger than the current data being processed the BPF verdict will be applied to multiple sendmsg/sendfile calls until N bytes are consumed. Note1 if a socket closes with apply_bytes counter non-zero this is not a problem because data is not being buffered for N bytes and is sent as its received. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef529539abde..a557a2a5d72d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -791,7 +791,8 @@ union bpf_attr { FN(getsockopt), \ FN(override_return), \ FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), + FN(msg_redirect_map), \ + FN(msg_apply_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 2b6c47597eab..17d6775f5431 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1928,6 +1928,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 91843d540a139eb8070bcff8aa10089164436deb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:20 -0700 Subject: bpf: sockmap, add msg_cork_bytes() helper In the case where we need a specific number of bytes before a verdict can be assigned, even if the data spans multiple sendmsg or sendfile calls. The BPF program may use msg_cork_bytes(). The extreme case is a user can call sendmsg repeatedly with 1-byte msg segments. Obviously, this is bad for performance but is still valid. If the BPF program needs N bytes to validate a header it can use msg_cork_bytes to specify N bytes and the BPF program will not be called again until N bytes have been accumulated. The infrastructure will attempt to coalesce data if possible so in many cases (most my use cases at least) the data will be in a single scatterlist element with data pointers pointing to start/end of the element. However, this is dependent on available memory so is not guaranteed. So BPF programs must validate data pointer ranges, but this is the case anyways to convince the verifier the accesses are valid. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a557a2a5d72d..1765cfb16c99 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -792,7 +792,8 @@ union bpf_attr { FN(override_return), \ FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ - FN(msg_apply_bytes), + FN(msg_apply_bytes), \ + FN(msg_cork_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 17d6775f5431..0c9daf6ee555 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1942,6 +1942,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3650,6 +3664,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 015632bb30daaaee64e1bcac07570860e0bf3092 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:25 -0700 Subject: bpf: sk_msg program helper bpf_sk_msg_pull_data Currently, if a bpf sk msg program is run the program can only parse data that the (start,end) pointers already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0,0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) BPF verdict is being decided. To support pulling in additional bytes for parsing use a new helper bpf_sk_msg_pull(start, end, flags) which works similar to cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offest into msg and data end pointer at 'end' bytes offset into message. After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length there are a few cases we need to handle. First the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefor, it is not necessesary to copy the data. However, it may be required. After finding the scatterlist element with 'start' offset byte in it there are two cases. One the range (start,end) is entirely contained in the sg element and is already linear. All that is needed is to update the data pointers, no allocate/copy is needed. The other case is (start, end) crosses sg element boundaries. In this case we allocate a block of size 'end - start' and copy the data to linearize it. Next sendpage hook has not copied any data in initial state so that data pointers are (0,0). In this case we handle it similar to the above sendmsg case except the allocation/copy must always happen. Then when sending the data we have possibly three memory regions that need to be sent, (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted. Lastly this operation will invalidate any previous data checks so BPF programs will have to revalidate pointers after making this BPF call. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +- net/core/filter.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1765cfb16c99..18b7c510c511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -793,7 +793,8 @@ union bpf_attr { FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ FN(msg_apply_bytes), \ - FN(msg_cork_bytes), + FN(msg_cork_bytes), \ + FN(msg_pull_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 0c9daf6ee555..c86f03fd9ea5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_pull_data, + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +{ + unsigned int len = 0, offset = 0, copy = 0; + struct scatterlist *sg = msg->sg_data; + int first_sg, last_sg, i, shift; + unsigned char *p, *to, *from; + int bytes = end - start; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg_start; + do { + len = sg[i].length; + offset += len; + if (start < offset + len) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != msg->sg_end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + if (!msg->sg_copy[i] && bytes <= len) + goto out; + + first_sg = i; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sg[i].length; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (bytes < copy) + break; + } while (i != msg->sg_end); + last_sg = i; + + if (unlikely(copy < end - start)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + p = page_address(page); + offset = 0; + + i = first_sg; + do { + from = sg_virt(&sg[i]); + len = sg[i].length; + to = p + offset; + + memcpy(to, from, len); + offset += len; + sg[i].length = 0; + put_page(sg_page(&sg[i])); + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != last_sg); + + sg[first_sg].length = copy; + sg_set_page(&sg[first_sg], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. + */ + shift = last_sg - first_sg - 1; + if (!shift) + goto out; + + i = first_sg + 1; + do { + int move_from; + + if (i + shift >= MAX_SKB_FRAGS) + move_from = i + shift - MAX_SKB_FRAGS; + else + move_from = i + shift; + + if (move_from == msg->sg_end) + break; + + sg[i] = sg[move_from]; + sg[move_from].length = 0; + sg[move_from].page_link = 0; + sg[move_from].offset = 0; + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (1); + msg->sg_end -= shift; + if (msg->sg_end < 0) + msg->sg_end += MAX_SKB_FRAGS; +out: + msg->data = sg_virt(&sg[i]) + start - offset; + msg->data_end = msg->data + bytes; + + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data) return true; return false; @@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From d719e3f21cf91d3f82bd827d46199ba41af2f73a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Mar 2018 11:57:20 +0100 Subject: netfilter: nft_ct: add NFT_CT_{SRC,DST}_{IP,IP6} All existing keys, except the NFT_CT_SRC and NFT_CT_DST are assumed to have strict datatypes. This is causing problems with sets and concatenations given the specific length of these keys is not known. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 12 ++++++++-- net/netfilter/nft_ct.c | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66dceee0ae30..6a3d653d5b27 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -909,8 +909,8 @@ enum nft_rt_attributes { * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms * @NFT_CT_HELPER: connection tracking helper assigned to conntrack * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol - * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address) - * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address) + * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address, deprecated) + * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address, deprecated) * @NFT_CT_PROTOCOL: conntrack layer 4 protocol * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination @@ -920,6 +920,10 @@ enum nft_rt_attributes { * @NFT_CT_AVGPKT: conntrack average bytes per packet * @NFT_CT_ZONE: conntrack zone * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack + * @NFT_CT_SRC_IP: conntrack layer 3 protocol source (IPv4 address) + * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address) + * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) + * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) */ enum nft_ct_keys { NFT_CT_STATE, @@ -941,6 +945,10 @@ enum nft_ct_keys { NFT_CT_AVGPKT, NFT_CT_ZONE, NFT_CT_EVENTMASK, + NFT_CT_SRC_IP, + NFT_CT_DST_IP, + NFT_CT_SRC_IP6, + NFT_CT_DST_IP6, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6ab274b14484..ea737fd789e8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -196,6 +196,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; + case NFT_CT_SRC_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->src.u3.ip; + return; + case NFT_CT_DST_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->dst.u3.ip; + return; + case NFT_CT_SRC_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); + return; + case NFT_CT_DST_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); + return; default: break; } @@ -419,6 +439,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return -EAFNOSUPPORT; } break; + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); + break; + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6); + break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) @@ -588,6 +622,10 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) -- cgit v1.2.3 From 472a73e00757b971d613d796374d2727b2e4954d Mon Sep 17 00:00:00 2001 From: Jack Ma Date: Mon, 19 Mar 2018 09:41:59 +1300 Subject: netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets. This patch introduces a new feature that allows bitshifting (left and right) operations to co-operate with existing iptables options. Reviewed-by: Florian Westphal Signed-off-by: Jack Ma Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connmark.h | 10 ++++ net/netfilter/xt_connmark.c | 77 +++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index 408a9654f05c..1aa5c955ee1e 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -19,11 +19,21 @@ enum { XT_CONNMARK_RESTORE }; +enum { + D_SHIFT_LEFT = 0, + D_SHIFT_RIGHT, +}; + struct xt_connmark_tginfo1 { __u32 ctmark, ctmask, nfmask; __u8 mode; }; +struct xt_connmark_tginfo2 { + __u32 ctmark, ctmask, nfmask; + __u8 shift_dir, shift_bits, mode; +}; + struct xt_connmark_mtinfo1 { __u32 mark, mask; __u8 invert; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 809639ce6f5a..773da82190dc 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -36,9 +36,10 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int -connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +connmark_tg_shift(struct sk_buff *skb, + const struct xt_connmark_tginfo1 *info, + u8 shift_bits, u8 shift_dir) { - const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u_int32_t newmark; @@ -50,6 +51,10 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (info->mode) { case XT_CONNMARK_SET: newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -57,7 +62,11 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_SAVE: newmark = (ct->mark & ~info->ctmask) ^ - (skb->mark & info->nfmask); + (skb->mark & info->nfmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -65,14 +74,34 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_RESTORE: newmark = (skb->mark & ~info->nfmask) ^ - (ct->mark & info->ctmask); + (ct->mark & info->ctmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; skb->mark = newmark; break; } - return XT_CONTINUE; } +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo1 *info = par->targinfo; + + return connmark_tg_shift(skb, info, 0, 0); +} + +static unsigned int +connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo2 *info = par->targinfo; + + return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info, + info->shift_bits, info->shift_dir); +} + static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; @@ -119,15 +148,27 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connmark_tg_reg __read_mostly = { - .name = "CONNMARK", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connmark_tg_check, - .target = connmark_tg, - .targetsize = sizeof(struct xt_connmark_tginfo1), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, +static struct xt_target connmark_tg_reg[] __read_mostly = { + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + } }; static struct xt_match connmark_mt_reg __read_mostly = { @@ -145,12 +186,14 @@ static int __init connmark_mt_init(void) { int ret; - ret = xt_register_target(&connmark_tg_reg); + ret = xt_register_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&connmark_mt_reg); if (ret < 0) { - xt_unregister_target(&connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); return ret; } return 0; @@ -159,7 +202,7 @@ static int __init connmark_mt_init(void) static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); - xt_unregister_target(&connmark_tg_reg); + xt_unregister_target(connmark_tg_reg); } module_init(connmark_mt_init); -- cgit v1.2.3 From 20710b3b81895c89e92bcc32ce85c0bede1171f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 Mar 2018 12:33:51 +0100 Subject: netfilter: ctnetlink: synproxy support This patch exposes synproxy information per-conntrack. Moreover, send sequence adjustment events once server sends us the SYN,ACK packet, so we can synchronize the sequence adjustment too for packets going as reply from the server, as part of the synproxy logic. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 10 +++ net/ipv4/netfilter/ipt_SYNPROXY.c | 8 +- net/ipv6/netfilter/ip6t_SYNPROXY.c | 8 +- net/netfilter/nf_conntrack_netlink.c | 87 +++++++++++++++++++++- 5 files changed, 109 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 9574bd40870b..c712eb6879f1 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -129,6 +129,7 @@ enum ip_conntrack_events { IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ + IPCT_SYNPROXY, /* synproxy has been set */ #ifdef __KERNEL__ __IPCT_MAX #endif diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 7397e022ce6e..77987111cab0 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -54,6 +54,7 @@ enum ctattr_type { CTA_MARK_MASK, CTA_LABELS, CTA_LABELS_MASK, + CTA_SYNPROXY, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -190,6 +191,15 @@ enum ctattr_natseq { }; #define CTA_NAT_SEQ_MAX (__CTA_NAT_SEQ_MAX - 1) +enum ctattr_synproxy { + CTA_SYNPROXY_UNSPEC, + CTA_SYNPROXY_ISN, + CTA_SYNPROXY_ITS, + CTA_SYNPROXY_TSOFF, + __CTA_SYNPROXY_MAX, +}; +#define CTA_SYNPROXY_MAX (__CTA_SYNPROXY_MAX - 1) + enum ctattr_expect { CTA_EXPECT_UNSPEC, CTA_EXPECT_MASTER, diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f75fc6b53115..690b17ef6a44 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -16,6 +16,7 @@ #include #include #include +#include static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, @@ -384,6 +385,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -392,8 +395,10 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -403,6 +408,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 437af8c95277..cb6d42b03cb5 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -18,6 +18,7 @@ #include #include #include +#include static struct ipv6hdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, @@ -405,6 +406,8 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -413,8 +416,10 @@ static unsigned int ipv6_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -424,6 +429,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8884d302d33a..11ef85a57244 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -440,6 +440,31 @@ err: return -1; } +static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *nest_parms; + + if (!synproxy) + return 0; + + nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) || + nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) || + nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) @@ -518,7 +543,8 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -730,6 +756,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_SEQADJ) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + + if (events & (1 << IPCT_SYNPROXY) && + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK @@ -1689,6 +1719,39 @@ err: return ret; } +static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = { + [CTA_SYNPROXY_ISN] = { .type = NLA_U32 }, + [CTA_SYNPROXY_ITS] = { .type = NLA_U32 }, + [CTA_SYNPROXY_TSOFF] = { .type = NLA_U32 }, +}; + +static int ctnetlink_change_synproxy(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *tb[CTA_SYNPROXY_MAX + 1]; + int err; + + if (!synproxy) + return 0; + + err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], + synproxy_policy, NULL); + if (err < 0) + return err; + + if (!tb[CTA_SYNPROXY_ISN] || + !tb[CTA_SYNPROXY_ITS] || + !tb[CTA_SYNPROXY_TSOFF]) + return -EINVAL; + + synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN])); + synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS])); + synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF])); + + return 0; +} + static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1759,6 +1822,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, return err; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1880,6 +1949,12 @@ ctnetlink_create_conntrack(struct net *net, goto err2; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + goto err2; + } + #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); @@ -1991,7 +2066,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << IPCT_MARK) | events, + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY) | + events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); @@ -2012,7 +2089,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << IPCT_MARK), + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } @@ -2282,6 +2360,9 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + if (ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; + #ifdef CONFIG_NF_CONNTRACK_MARK if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; -- cgit v1.2.3 From 5adc1668ddc42bb44fd6d006cacad74ed0cbf49d Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 4 Mar 2018 09:28:53 +0100 Subject: netfilter: ebtables: add support for matching ICMP type and code We already have ICMPv6 type/code matches. This adds support for IPv4 ICMP matches in the same way. Signed-off-by: Matthias Schiffer Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebt_ip.h | 13 +++++++-- net/bridge/netfilter/ebt_ip.c | 43 +++++++++++++++++++++------- 2 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip.h b/include/uapi/linux/netfilter_bridge/ebt_ip.h index 8e462fb1983f..4ed7fbb0a482 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_ip.h +++ b/include/uapi/linux/netfilter_bridge/ebt_ip.h @@ -24,8 +24,9 @@ #define EBT_IP_PROTO 0x08 #define EBT_IP_SPORT 0x10 #define EBT_IP_DPORT 0x20 +#define EBT_IP_ICMP 0x40 #define EBT_IP_MASK (EBT_IP_SOURCE | EBT_IP_DEST | EBT_IP_TOS | EBT_IP_PROTO |\ - EBT_IP_SPORT | EBT_IP_DPORT ) + EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP) #define EBT_IP_MATCH "ip" /* the same values are used for the invflags */ @@ -38,8 +39,14 @@ struct ebt_ip_info { __u8 protocol; __u8 bitmask; __u8 invflags; - __u16 sport[2]; - __u16 dport[2]; + union { + __u16 sport[2]; + __u8 icmp_type[2]; + }; + union { + __u16 dport[2]; + __u8 icmp_code[2]; + }; }; #endif diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 2b46c50abce0..8cb8f8395768 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -19,9 +19,15 @@ #include #include -struct tcpudphdr { - __be16 src; - __be16 dst; +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; }; static bool @@ -30,8 +36,8 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; - const struct tcpudphdr *pptr; - struct tcpudphdr _ports; + const union pkthdr *pptr; + union pkthdr _pkthdr; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) @@ -50,29 +56,38 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->bitmask & EBT_IP_PROTO) { if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; - if (!(info->bitmask & EBT_IP_DPORT) && - !(info->bitmask & EBT_IP_SPORT)) + if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | + EBT_IP_ICMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; + + /* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, - sizeof(_ports), &_ports); + sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP_DPORT) { - u32 dst = ntohs(pptr->dst); + u32 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { - u32 src = ntohs(pptr->src); + u32 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } + if ((info->bitmask & EBT_IP_ICMP) && + NF_INVF(info, EBT_IP_ICMP, + pptr->icmphdr.type < info->icmp_type[0] || + pptr->icmphdr.type > info->icmp_type[1] || + pptr->icmphdr.code < info->icmp_code[0] || + pptr->icmphdr.code > info->icmp_code[1])) + return false; } return true; } @@ -101,6 +116,14 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; + if (info->bitmask & EBT_IP_ICMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_ICMP) + return -EINVAL; + if (info->icmp_type[0] > info->icmp_type[1] || + info->icmp_code[0] > info->icmp_code[1]) + return -EINVAL; + } return 0; } -- cgit v1.2.3 From 78d9f4d49bbecd101b4e5faf19f8f70719fee2ca Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 4 Mar 2018 09:28:54 +0100 Subject: netfilter: ebtables: add support for matching IGMP type We already have ICMPv6 type/code matches (which can be used to distinguish different types of MLD packets). Add support for IPv4 IGMP matches in the same way. Signed-off-by: Matthias Schiffer Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebt_ip.h | 4 +++- net/bridge/netfilter/ebt_ip.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip.h b/include/uapi/linux/netfilter_bridge/ebt_ip.h index 4ed7fbb0a482..46d6261370b0 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_ip.h +++ b/include/uapi/linux/netfilter_bridge/ebt_ip.h @@ -25,8 +25,9 @@ #define EBT_IP_SPORT 0x10 #define EBT_IP_DPORT 0x20 #define EBT_IP_ICMP 0x40 +#define EBT_IP_IGMP 0x80 #define EBT_IP_MASK (EBT_IP_SOURCE | EBT_IP_DEST | EBT_IP_TOS | EBT_IP_PROTO |\ - EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP) + EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP | EBT_IP_IGMP) #define EBT_IP_MATCH "ip" /* the same values are used for the invflags */ @@ -42,6 +43,7 @@ struct ebt_ip_info { union { __u16 sport[2]; __u8 icmp_type[2]; + __u8 igmp_type[2]; }; union { __u16 dport[2]; diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 8cb8f8395768..ffaa8ce2e724 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -28,6 +28,9 @@ union pkthdr { u8 type; u8 code; } icmphdr; + struct { + u8 type; + } igmphdr; }; static bool @@ -57,12 +60,12 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | - EBT_IP_ICMP))) + EBT_IP_ICMP | EBT_IP_IGMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; - /* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */ + /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) @@ -88,6 +91,11 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) pptr->icmphdr.code < info->icmp_code[0] || pptr->icmphdr.code > info->icmp_code[1])) return false; + if ((info->bitmask & EBT_IP_IGMP) && + NF_INVF(info, EBT_IP_IGMP, + pptr->igmphdr.type < info->igmp_type[0] || + pptr->igmphdr.type > info->igmp_type[1])) + return false; } return true; } @@ -124,6 +132,13 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par) info->icmp_code[0] > info->icmp_code[1]) return -EINVAL; } + if (info->bitmask & EBT_IP_IGMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_IGMP) + return -EINVAL; + if (info->igmp_type[0] > info->igmp_type[1]) + return -EINVAL; + } return 0; } -- cgit v1.2.3 From 2cb021f5de55b1d158fa18b0215a4613c3289a82 Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Thu, 1 Mar 2018 12:39:16 +0300 Subject: cfg80211/nl80211: add CAC_STARTED event CAC_STARTED event is needed for DFS offload feature and should be generated by driver/HW if DFS_OFFLOAD is enabled. Signed-off-by: Dmitry Lebed Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/mlme.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c13c84304be3..b8e989073cbb 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5204,6 +5204,8 @@ enum nl80211_smps_mode { * non-operating channel is expired and no longer valid. New CAC must * be done on this channel before starting the operation. This is not * applicable for ETSI dfs domain where pre-CAC is valid for ever. + * @NL80211_RADAR_CAC_STARTED: Channel Availability Check has been started, + * should be generated by HW if NL80211_EXT_FEATURE_DFS_OFFLOAD is enabled. */ enum nl80211_radar_event { NL80211_RADAR_DETECTED, @@ -5211,6 +5213,7 @@ enum nl80211_radar_event { NL80211_RADAR_CAC_ABORTED, NL80211_RADAR_NOP_FINISHED, NL80211_RADAR_PRE_CAC_EXPIRED, + NL80211_RADAR_CAC_STARTED, }; /** diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bbb9907bfa86..6b6818dd76bd 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -888,14 +888,17 @@ void cfg80211_cac_event(struct net_device *netdev, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); - break; + /* fall through */ case NL80211_RADAR_CAC_ABORTED: + wdev->cac_started = false; + break; + case NL80211_RADAR_CAC_STARTED: + wdev->cac_started = true; break; default: WARN_ON(1); return; } - wdev->cac_started = false; nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } -- cgit v1.2.3 From 13cf6dec93e021ebd297619ea1926aea31b6430b Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Thu, 1 Mar 2018 12:39:15 +0300 Subject: cfg80211/nl80211: add DFS offload flag Add wiphy EXT_FEATURE flag to indicate that HW or driver does all DFS actions by itself. User-space functionality already implemented in hostapd using vendor-specific (QCA) OUI to advertise DFS offload support. Need to introduce generic flag to inform about DFS offload support. For devices with DFS_OFFLOAD flag set user-space will no longer need to issue CAC or do any actions in response to "radar detected" events. HW will do everything by itself and send events to user-space to indicate that CAC was started/finished, etc. Signed-off-by: Dmitrii Lebed Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b8e989073cbb..60fefc5a2ea4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4999,6 +4999,12 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_LOW_SPAN_SCAN: Driver supports low span scan. * @NL80211_EXT_FEATURE_LOW_POWER_SCAN: Driver supports low power scan. * @NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN: Driver supports high accuracy scan. + * @NL80211_EXT_FEATURE_DFS_OFFLOAD: HW/driver will offload DFS actions. + * Device or driver will do all DFS-related actions by itself, + * informing user-space about CAC progress, radar detection event, + * channel change triggered by radar detection event. + * No need to start CAC from user-space, no need to react to + * "radar detected" event. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5029,6 +5035,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_LOW_SPAN_SCAN, NL80211_EXT_FEATURE_LOW_POWER_SCAN, NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, + NL80211_EXT_FEATURE_DFS_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a910150f8169..fe27ab443d01 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7551,12 +7551,13 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; struct cfg80211_chan_def chandef; enum nl80211_dfs_regions dfs_region; unsigned int cac_time_ms; int err; - dfs_region = reg_get_dfs_region(wdev->wiphy); + dfs_region = reg_get_dfs_region(wiphy); if (dfs_region == NL80211_DFS_UNSET) return -EINVAL; @@ -7570,17 +7571,20 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, if (wdev->cac_started) return -EBUSY; - err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, - wdev->iftype); + err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); if (err < 0) return err; if (err == 0) return -EINVAL; - if (!cfg80211_chandef_dfs_usable(wdev->wiphy, &chandef)) + if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) return -EINVAL; + /* CAC start is offloaded to HW and can't be started manually */ + if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) + return -EOPNOTSUPP; + if (!rdev->ops->start_radar_detection) return -EOPNOTSUPP; -- cgit v1.2.3 From c30b70deb5f4861f590031c33fd3ec6cc63f1df1 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Wed, 21 Mar 2018 14:37:44 +0100 Subject: tipc: implement socket diagnostics for AF_TIPC This commit adds socket diagnostics capability for AF_TIPC in netlink family NETLINK_SOCK_DIAG in a new kernel module (diag.ko). The following are key design considerations: - config TIPC_DIAG has default y, like INET_DIAG. - only requests with flag NLM_F_DUMP is supported (dump all). - tipc_sock_diag_req message is introduced to send filter parameters. - the response attributes are of TLV, some nested. To avoid exposing data structures between diag and tipc modules and avoid code duplication, the following additions are required: - export tipc_nl_sk_walk function to reuse socket iterator. - export tipc_sk_fill_sock_diag to fill the tipc diag attributes. - create a sock_diag response message in __tipc_add_sock_diag defined in diag.c and use the above exported tipc_sk_fill_sock_diag to fill response. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 18 ++++++ include/uapi/linux/tipc_sockets_diag.h | 17 +++++ net/tipc/Kconfig | 8 +++ net/tipc/Makefile | 5 ++ net/tipc/diag.c | 114 +++++++++++++++++++++++++++++++++ net/tipc/socket.c | 72 +++++++++++++++++++-- net/tipc/socket.h | 10 ++- 7 files changed, 238 insertions(+), 6 deletions(-) create mode 100644 include/uapi/linux/tipc_sockets_diag.h create mode 100644 net/tipc/diag.c (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 469aa67a5ecb..d7cec0480d70 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -114,6 +114,13 @@ enum { TIPC_NLA_SOCK_REF, /* u32 */ TIPC_NLA_SOCK_CON, /* nest */ TIPC_NLA_SOCK_HAS_PUBL, /* flag */ + TIPC_NLA_SOCK_STAT, /* nest */ + TIPC_NLA_SOCK_TYPE, /* u32 */ + TIPC_NLA_SOCK_INO, /* u32 */ + TIPC_NLA_SOCK_UID, /* u32 */ + TIPC_NLA_SOCK_TIPC_STATE, /* u32 */ + TIPC_NLA_SOCK_COOKIE, /* u64 */ + TIPC_NLA_SOCK_PAD, /* flag */ __TIPC_NLA_SOCK_MAX, TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 @@ -238,6 +245,17 @@ enum { TIPC_NLA_CON_MAX = __TIPC_NLA_CON_MAX - 1 }; +/* Nest, socket statistics info */ +enum { + TIPC_NLA_SOCK_STAT_RCVQ, /* u32 */ + TIPC_NLA_SOCK_STAT_SENDQ, /* u32 */ + TIPC_NLA_SOCK_STAT_LINK_CONG, /* flag */ + TIPC_NLA_SOCK_STAT_CONN_CONG, /* flag */ + + __TIPC_NLA_SOCK_STAT_MAX, + TIPC_NLA_SOCK_STAT_MAX = __TIPC_NLA_SOCK_STAT_MAX - 1 +}; + /* Nest, link propreties. Valid for link, media and bearer */ enum { TIPC_NLA_PROP_UNSPEC, diff --git a/include/uapi/linux/tipc_sockets_diag.h b/include/uapi/linux/tipc_sockets_diag.h new file mode 100644 index 000000000000..7678cf2f0dcc --- /dev/null +++ b/include/uapi/linux/tipc_sockets_diag.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* AF_TIPC sock_diag interface for querying open sockets */ + +#ifndef _UAPI__TIPC_SOCKETS_DIAG_H__ +#define _UAPI__TIPC_SOCKETS_DIAG_H__ + +#include +#include + +/* Request */ +struct tipc_sock_diag_req { + __u8 sdiag_family; /* must be AF_TIPC */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 tidiag_states; /* query*/ +}; +#endif /* _UAPI__TIPC_SOCKETS_DIAG_H__ */ diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index c25a3a149dc4..e450212121d2 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -34,3 +34,11 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y + +config TIPC_DIAG + tristate "TIPC: socket monitoring interface" + depends on TIPC + default y + ---help--- + Support for TIPC socket monitoring interface used by ss tool. + If unsure, say Y. diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 1edb7192aa2f..aca168f2abb1 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -14,3 +14,8 @@ tipc-y += addr.o bcast.o bearer.o \ tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o + + +obj-$(CONFIG_TIPC_DIAG) += diag.o + +tipc_diag-y := diag.o diff --git a/net/tipc/diag.c b/net/tipc/diag.c new file mode 100644 index 000000000000..46d9cd62f781 --- /dev/null +++ b/net/tipc/diag.c @@ -0,0 +1,114 @@ +/* + * net/tipc/diag.c: TIPC socket diag + * + * Copyright (c) 2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "socket.h" +#include +#include + +static u64 __tipc_diag_gen_cookie(struct sock *sk) +{ + u32 res[2]; + + sock_diag_save_cookie(sk, res); + return *((u64 *)res); +} + +static int __tipc_add_sock_diag(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk) +{ + struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); + struct nlmsghdr *nlh; + int err; + + nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, + NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states, + __tipc_diag_gen_cookie); + if (err) + return err; + + nlmsg_end(skb, nlh); + return 0; +} + +static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); +} + +static int tipc_sock_diag_handler_dump(struct sk_buff *skb, + struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct tipc_sock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = tipc_diag_dump, + }; + netlink_dump_start(net->diag_nlsk, skb, h, &c); + return 0; + } + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler tipc_sock_diag_handler = { + .family = AF_TIPC, + .dump = tipc_sock_diag_handler_dump, +}; + +static int __init tipc_diag_init(void) +{ + return sock_diag_register(&tipc_sock_diag_handler); +} + +static void __exit tipc_diag_exit(void) +{ + sock_diag_unregister(&tipc_sock_diag_handler); +} + +module_init(tipc_diag_init); +module_exit(tipc_diag_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 35ac0f5b9529..07559ce4b8ba 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3213,10 +3213,10 @@ msg_cancel: return -EMSGSIZE; } -static int __tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, - int (*skb_handler)(struct sk_buff *skb, - struct netlink_callback *cb, - struct tipc_sock *tsk)) +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); @@ -3255,10 +3255,72 @@ out: return skb->len; } +EXPORT_SYMBOL(tipc_nl_sk_walk); + +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)) +{ + struct sock *sk = &tsk->sk; + struct nlattr *attrs; + struct nlattr *stat; + + /*filter response w.r.t sk_state*/ + if (!(sk_filter_state & (1 << sk->sk_state))) + return 0; + + attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + if (!attrs) + goto msg_cancel; + + if (__tipc_nl_add_sk_info(skb, tsk)) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type) || + nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) || + nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || + nla_put_u32(skb, TIPC_NLA_SOCK_UID, + from_kuid_munged(sk_user_ns(sk), sock_i_uid(sk))) || + nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, + tipc_diag_gen_cookie(sk), + TIPC_NLA_SOCK_PAD)) + goto attr_msg_cancel; + + stat = nla_nest_start(skb, TIPC_NLA_SOCK_STAT); + if (!stat) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, + skb_queue_len(&sk->sk_receive_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, + skb_queue_len(&sk->sk_write_queue))) + goto stat_msg_cancel; + + if (tsk->cong_link_cnt && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG)) + goto stat_msg_cancel; + + if (tsk_conn_cong(tsk) && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG)) + goto stat_msg_cancel; + + nla_nest_end(skb, stat); + nla_nest_end(skb, attrs); + + return 0; + +stat_msg_cancel: + nla_nest_cancel(skb, stat); +attr_msg_cancel: + nla_nest_cancel(skb, attrs); +msg_cancel: + return -EMSGSIZE; +} +EXPORT_SYMBOL(tipc_sk_fill_sock_diag); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) { - return __tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); + return tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); } /* Caller should hold socket lock for the passed tipc socket. */ diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 06fb5944cf76..aae3fd4cd06c 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -49,6 +49,8 @@ #define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) #define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) +struct tipc_sock; + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); @@ -59,5 +61,11 @@ int tipc_sk_rht_init(struct net *net); void tipc_sk_rht_destroy(struct net *net); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb); - +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)); +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)); #endif -- cgit v1.2.3 From 872619d8cf810c17279335ef531a2a34f3b4e589 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Wed, 21 Mar 2018 14:37:45 +0100 Subject: tipc: step sk->sk_drops when rcv buffer is full Currently when tipc is unable to queue a received message on a socket, the message is rejected back to the sender with error TIPC_ERR_OVERLOAD. However, the application on this socket has no knowledge about these discards. In this commit, we try to step the sk_drops counter when tipc is unable to queue a received message. Export sk_drops using tipc socket diagnostics. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/socket.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d7cec0480d70..d896ded51bcb 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -251,6 +251,7 @@ enum { TIPC_NLA_SOCK_STAT_SENDQ, /* u32 */ TIPC_NLA_SOCK_STAT_LINK_CONG, /* flag */ TIPC_NLA_SOCK_STAT_CONN_CONG, /* flag */ + TIPC_NLA_SOCK_STAT_DROP, /* u32 */ __TIPC_NLA_SOCK_STAT_MAX, TIPC_NLA_SOCK_STAT_MAX = __TIPC_NLA_SOCK_STAT_MAX - 1 diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 07559ce4b8ba..732ec894f69f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2122,8 +2122,10 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, (!sk_conn && msg_connected(hdr)) || (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; - else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { + atomic_inc(&sk->sk_drops); err = TIPC_ERR_OVERLOAD; + } if (unlikely(err)) { tipc_skb_reject(net, err, skb, xmitq); @@ -2202,6 +2204,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); + atomic_inc(&sk->sk_drops); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) __skb_queue_tail(xmitq, skb); break; @@ -3293,7 +3296,9 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, skb_queue_len(&sk->sk_receive_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, - skb_queue_len(&sk->sk_write_queue))) + skb_queue_len(&sk->sk_write_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, + atomic_read(&sk->sk_drops))) goto stat_msg_cancel; if (tsk->cong_link_cnt && -- cgit v1.2.3 From 14452ca3b5ce304fb2fea96dbc9ca1e4e7978551 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 21 Mar 2018 19:48:14 -0600 Subject: net: qualcomm: rmnet: Export mux_id and flags to netlink Define new netlink attributes for rmnet mux_id and flags. These flags / mux_id were earlier using vlan flags / id respectively. The flag bits are also moved to uapi and are renamed with prefix RMNET_FLAG_*. Also add the rmnet policy to handle the new netlink attributes. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 41 +++++++++++++--------- .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 10 +++--- .../ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +- .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 2 +- .../net/ethernet/qualcomm/rmnet/rmnet_private.h | 6 ---- include/uapi/linux/if_link.h | 21 +++++++++++ 6 files changed, 53 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 096301a31c4f..c5b7b2af25d8 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -43,6 +43,11 @@ /* Local Definitions and Declarations */ +static const struct nla_policy rmnet_policy[IFLA_RMNET_MAX + 1] = { + [IFLA_RMNET_MUX_ID] = { .type = NLA_U16 }, + [IFLA_RMNET_FLAGS] = { .len = sizeof(struct ifla_rmnet_flags) }, +}; + static int rmnet_is_real_dev_registered(const struct net_device *real_dev) { return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler; @@ -131,7 +136,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - u32 data_format = RMNET_INGRESS_FORMAT_DEAGGREGATION; + u32 data_format = RMNET_FLAGS_INGRESS_DEAGGREGATION; struct net_device *real_dev; int mode = RMNET_EPMODE_VND; struct rmnet_endpoint *ep; @@ -143,14 +148,14 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (!real_dev || !dev) return -ENODEV; - if (!data[IFLA_VLAN_ID]) + if (!data[IFLA_RMNET_MUX_ID]) return -EINVAL; ep = kzalloc(sizeof(*ep), GFP_ATOMIC); if (!ep) return -ENOMEM; - mux_id = nla_get_u16(data[IFLA_VLAN_ID]); + mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); err = rmnet_register_real_device(real_dev); if (err) @@ -165,10 +170,10 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]); - if (data[IFLA_VLAN_FLAGS]) { - struct ifla_vlan_flags *flags; + if (data[IFLA_RMNET_FLAGS]) { + struct ifla_rmnet_flags *flags; - flags = nla_data(data[IFLA_VLAN_FLAGS]); + flags = nla_data(data[IFLA_RMNET_FLAGS]); data_format = flags->flags & flags->mask; } @@ -276,10 +281,10 @@ static int rmnet_rtnl_validate(struct nlattr *tb[], struct nlattr *data[], { u16 mux_id; - if (!data || !data[IFLA_VLAN_ID]) + if (!data || !data[IFLA_RMNET_MUX_ID]) return -EINVAL; - mux_id = nla_get_u16(data[IFLA_VLAN_ID]); + mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); if (mux_id > (RMNET_MAX_LOGICAL_EP - 1)) return -ERANGE; @@ -304,8 +309,8 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], port = rmnet_get_port_rtnl(real_dev); - if (data[IFLA_VLAN_ID]) { - mux_id = nla_get_u16(data[IFLA_VLAN_ID]); + if (data[IFLA_RMNET_MUX_ID]) { + mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); ep = rmnet_get_endpoint(port, priv->mux_id); hlist_del_init_rcu(&ep->hlnode); @@ -315,10 +320,10 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], priv->mux_id = mux_id; } - if (data[IFLA_VLAN_FLAGS]) { - struct ifla_vlan_flags *flags; + if (data[IFLA_RMNET_FLAGS]) { + struct ifla_rmnet_flags *flags; - flags = nla_data(data[IFLA_VLAN_FLAGS]); + flags = nla_data(data[IFLA_RMNET_FLAGS]); port->data_format = flags->flags & flags->mask; } @@ -327,13 +332,16 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], static size_t rmnet_get_size(const struct net_device *dev) { - return nla_total_size(2) /* IFLA_VLAN_ID */ + - nla_total_size(sizeof(struct ifla_vlan_flags)); /* IFLA_VLAN_FLAGS */ + return + /* IFLA_RMNET_MUX_ID */ + nla_total_size(2) + + /* IFLA_RMNET_FLAGS */ + nla_total_size(sizeof(struct ifla_rmnet_flags)); } struct rtnl_link_ops rmnet_link_ops __read_mostly = { .kind = "rmnet", - .maxtype = __IFLA_VLAN_MAX, + .maxtype = __IFLA_RMNET_MAX, .priv_size = sizeof(struct rmnet_priv), .setup = rmnet_vnd_setup, .validate = rmnet_rtnl_validate, @@ -341,6 +349,7 @@ struct rtnl_link_ops rmnet_link_ops __read_mostly = { .dellink = rmnet_dellink, .get_size = rmnet_get_size, .changelink = rmnet_changelink, + .policy = rmnet_policy, }; /* Needs either rcu_read_lock() or rtnl lock */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index c758248bf2cd..6fcd586e9804 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -70,7 +70,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, u8 mux_id; if (RMNET_MAP_GET_CD_BIT(skb)) { - if (port->data_format & RMNET_INGRESS_FORMAT_MAP_COMMANDS) + if (port->data_format & RMNET_FLAGS_INGRESS_MAP_COMMANDS) return rmnet_map_command(skb, port); goto free_skb; @@ -93,7 +93,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, skb_pull(skb, sizeof(struct rmnet_map_header)); rmnet_set_skb_proto(skb); - if (port->data_format & RMNET_INGRESS_FORMAT_MAP_CKSUMV4) { + if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) { if (!rmnet_map_checksum_downlink_packet(skb, len + pad)) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -121,7 +121,7 @@ rmnet_map_ingress_handler(struct sk_buff *skb, skb_push(skb, ETH_HLEN); } - if (port->data_format & RMNET_INGRESS_FORMAT_DEAGGREGATION) { + if (port->data_format & RMNET_FLAGS_INGRESS_DEAGGREGATION) { while ((skbn = rmnet_map_deaggregate(skb, port)) != NULL) __rmnet_map_ingress_handler(skbn, port); @@ -141,7 +141,7 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, additional_header_len = 0; required_headroom = sizeof(struct rmnet_map_header); - if (port->data_format & RMNET_EGRESS_FORMAT_MAP_CKSUMV4) { + if (port->data_format & RMNET_FLAGS_EGRESS_MAP_CKSUMV4) { additional_header_len = sizeof(struct rmnet_map_ul_csum_header); required_headroom += additional_header_len; } @@ -151,7 +151,7 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, goto fail; } - if (port->data_format & RMNET_EGRESS_FORMAT_MAP_CKSUMV4) + if (port->data_format & RMNET_FLAGS_EGRESS_MAP_CKSUMV4) rmnet_map_checksum_uplink_packet(skb, orig_dev); map_header = rmnet_map_add_map_header(skb, additional_header_len, 0); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c index afa2b862515f..78fdad0c6f76 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c @@ -69,7 +69,7 @@ static void rmnet_map_send_ack(struct sk_buff *skb, struct rmnet_map_control_command *cmd; int xmit_status; - if (port->data_format & RMNET_INGRESS_FORMAT_MAP_CKSUMV4) { + if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) { if (skb->len < sizeof(struct rmnet_map_header) + RMNET_MAP_GET_LENGTH(skb) + sizeof(struct rmnet_map_dl_csum_trailer)) { diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index e8f6c79157be..a6ea09416f8d 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -309,7 +309,7 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb, maph = (struct rmnet_map_header *)skb->data; packet_len = ntohs(maph->pkt_len) + sizeof(struct rmnet_map_header); - if (port->data_format & RMNET_INGRESS_FORMAT_MAP_CKSUMV4) + if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) packet_len += sizeof(struct rmnet_map_dl_csum_trailer); if (((int)skb->len - (int)packet_len) < 0) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h index 98365efc2d9a..b9cc4f85f229 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h @@ -18,12 +18,6 @@ #define RMNET_NEEDED_HEADROOM 16 #define RMNET_TX_QUEUE_LEN 1000 -/* Constants */ -#define RMNET_INGRESS_FORMAT_DEAGGREGATION BIT(0) -#define RMNET_INGRESS_FORMAT_MAP_COMMANDS BIT(1) -#define RMNET_INGRESS_FORMAT_MAP_CKSUMV4 BIT(2) -#define RMNET_EGRESS_FORMAT_MAP_CKSUMV4 BIT(3) - /* Replace skb->dev to a virtual rmnet device and pass up the stack */ #define RMNET_EPMODE_VND (1) /* Pass the frame directly to another device with dev_queue_xmit() */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 11d0c0ea2bfa..68699f654118 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -959,4 +959,25 @@ enum { #define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3 From c46234ebb4d1eee5e09819f49169e51cfc6eb909 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:35 -0700 Subject: tls: RX path for ktls Add rx path for tls software implementation. recvmsg, splice_read, and poll implemented. An additional sockopt TLS_RX is added, with the same interface as TLS_TX. Either TLX_RX or TLX_TX may be provided separately, or together (with two different setsockopt calls with appropriate keys). Control messages are passed via CMSG in a similar way to transmit. If no cmsg buffer is passed, then only application data records will be passed to userspace, and EIO is returned for other types of alerts. EBADMSG is passed for decryption errors, and EMSGSIZE is passed for framing too big, and EBADMSG for framing too small (matching openssl semantics). EINVAL is returned for TLS versions that do not match the original setsockopt call. All are unrecoverable. strparser is used to parse TLS framing. Decryption is done directly in to userspace buffers if they are large enough to support it, otherwise sk_cow_data is called (similar to ipsec), and buffers are decrypted in place and copied. splice_read always decrypts in place, since no buffers are provided to decrypt in to. sk_poll is overridden, and only returns POLLIN if a full TLS message is received. Otherwise we wait for strparser to finish reading a full frame. Actual decryption is only done during recvmsg or splice_read calls. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 27 ++- include/uapi/linux/tls.h | 2 + net/tls/Kconfig | 1 + net/tls/tls_main.c | 62 ++++- net/tls/tls_sw.c | 587 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 609 insertions(+), 70 deletions(-) (limited to 'include/uapi') diff --git a/include/net/tls.h b/include/net/tls.h index 095b72283861..437a746300bf 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -58,8 +59,18 @@ struct tls_sw_context { struct crypto_aead *aead_send; + struct crypto_aead *aead_recv; struct crypto_wait async_wait; + /* Receive context */ + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; + /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; @@ -96,12 +107,17 @@ struct tls_context { struct tls_crypto_info crypto_send; struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; }; + union { + struct tls_crypto_info crypto_recv; + struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; + }; void *priv_ctx; u8 conf:2; struct cipher_context tx; + struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; @@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx); +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_tx_resources(struct sock *sk); +void tls_sw_free_resources(struct sock *sk); +int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 293b2cdad88d..c6633e97eca4 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -38,6 +38,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ +#define TLS_RX 2 /* Set receive parameters */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -59,6 +60,7 @@ #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 #define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 struct tls_crypto_info { __u16 version; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index eb583038c67e..89b8745a986f 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -7,6 +7,7 @@ config TLS select CRYPTO select CRYPTO_AES select CRYPTO_GCM + select STREAM_PARSER default n ---help--- Enable kernel support for TLS protocol. This allows symmetric diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index c405beeda765..6f5c1146da4a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -54,12 +54,15 @@ enum { enum { TLS_BASE, TLS_SW_TX, + TLS_SW_RX, + TLS_SW_RXTX, TLS_NUM_CONFIG, }; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto_ops tls_sw_proto_ops; static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { @@ -261,9 +264,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); - if (ctx->conf == TLS_SW_TX) - tls_sw_free_tx_resources(sk); + if (ctx->conf == TLS_SW_TX || + ctx->conf == TLS_SW_RX || + ctx->conf == TLS_SW_RXTX) { + tls_sw_free_resources(sk); + } skip_tx_cleanup: release_sock(sk); @@ -365,8 +373,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, - unsigned int optlen) +static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, + unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); @@ -378,7 +386,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - crypto_info = &ctx->crypto_send; + if (tx) + crypto_info = &ctx->crypto_send; + else + crypto_info = &ctx->crypto_recv; + /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,15 +429,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } /* currently SW is default, we will have ethtool in future */ - rc = tls_set_sw_offload(sk, ctx); - conf = TLS_SW_TX; + if (tx) { + rc = tls_set_sw_offload(sk, ctx, 1); + if (ctx->conf == TLS_SW_RX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_TX; + } else { + rc = tls_set_sw_offload(sk, ctx, 0); + if (ctx->conf == TLS_SW_TX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_RX; + } + if (rc) goto err_crypto_info; ctx->conf = conf; update_sk_prot(sk, ctx); - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; + if (tx) { + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + } else { + sk->sk_socket->ops = &tls_sw_proto_ops; + } goto out; err_crypto_info: @@ -441,8 +469,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname, switch (optname) { case TLS_TX: + case TLS_RX: lock_sock(sk); - rc = do_tls_setsockopt_tx(sk, optval, optlen); + rc = do_tls_setsockopt_conf(sk, optval, optlen, + optname == TLS_TX); release_sock(sk); break; default: @@ -473,6 +503,14 @@ static void build_protos(struct proto *prot, struct proto *base) prot[TLS_SW_TX] = prot[TLS_BASE]; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; + + prot[TLS_SW_RX] = prot[TLS_BASE]; + prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RX].close = tls_sk_proto_close; + + prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; + prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RXTX].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) @@ -531,6 +569,10 @@ static int __init tls_register(void) { build_protos(tls_prots[TLSV4], &tcp_prot); + tls_sw_proto_ops = inet_stream_ops; + tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.splice_read = tls_sw_splice_read; + tcp_register_ulp(&tcp_tls_ulp_ops); return 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1c79d9ad1731..4dc766b03f00 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -34,11 +34,60 @@ * SOFTWARE. */ +#include #include #include +#include #include +static int tls_do_decryption(struct sock *sk, + struct scatterlist *sgin, + struct scatterlist *sgout, + char *iv_recv, + size_t data_len, + struct sk_buff *skb, + gfp_t flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + struct aead_request *aead_req; + + int ret; + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_recv); + + aead_req = kzalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + + aead_request_set_tfm(aead_req, ctx->aead_recv); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); + aead_request_set_crypt(aead_req, sgin, sgout, + data_len + tls_ctx->rx.tag_size, + (u8 *)iv_recv); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); + + if (ret < 0) + goto out; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + + ctx->decrypted = true; + + ctx->saved_data_ready(sk); + +out: + kfree(aead_req); + return ret; +} + static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -581,13 +630,404 @@ sendpage_end: return ret; } -void tls_sw_free_tx_resources(struct sock *sk) +static struct sk_buff *tls_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + while (!(skb = ctx->recv_pkt)) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size]; + struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; + struct scatterlist *sgin = &sgin_arr[0]; + struct strp_msg *rxm = strp_msg(skb); + int ret, nsg = ARRAY_SIZE(sgin_arr); + char aad_recv[TLS_AAD_SPACE_SIZE]; + struct sk_buff *unused; + + ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (ret < 0) + return ret; + + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + if (!sgout) { + nsg = skb_cow_data(skb, 0, &unused) + 1; + sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); + if (!sgout) + sgout = sgin; + } + + sg_init_table(sgin, nsg); + sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + + nsg = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + + tls_make_aad(aad_recv, + rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, + tls_ctx->rx.rec_seq_size, + ctx->control); + + ret = tls_do_decryption(sk, sgin, sgout, iv, + rxm->full_len - tls_ctx->rx.overhead_size, + skb, sk->sk_allocation); + + if (sgin != &sgin_arr[0]) + kfree(sgin); + + return ret; +} + +static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + + return false; + } + + /* Finished with message */ + ctx->recv_pkt = NULL; + kfree_skb(skb); + strp_unpause(&ctx->strp); + + return true; +} + +int tls_sw_recvmsg(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + unsigned char control; + struct strp_msg *rxm; + struct sk_buff *skb; + ssize_t copied = 0; + bool cmsg = false; + int err = 0; + long timeo; + + flags |= nonblock; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + bool zc = false; + int chunk = 0; + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end; + + rxm = strp_msg(skb); + if (!cmsg) { + int cerr; + + cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, + sizeof(ctx->control), &ctx->control); + cmsg = true; + control = ctx->control; + if (ctx->control != TLS_RECORD_TYPE_DATA) { + if (cerr || msg->msg_flags & MSG_CTRUNC) { + err = -EIO; + goto recv_end; + } + } + } else if (control != ctx->control) { + goto recv_end; + } + + if (!ctx->decrypted) { + int page_count; + int to_copy; + + page_count = iov_iter_npages(&msg->msg_iter, + MAX_SKB_FRAGS); + to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (to_copy <= len && page_count < MAX_SKB_FRAGS && + likely(!(flags & MSG_PEEK))) { + struct scatterlist sgin[MAX_SKB_FRAGS + 1]; + char unused[21]; + int pages = 0; + + zc = true; + sg_init_table(sgin, MAX_SKB_FRAGS + 1); + sg_set_buf(&sgin[0], unused, 13); + + err = zerocopy_from_iter(sk, &msg->msg_iter, + to_copy, &pages, + &chunk, &sgin[1], + MAX_SKB_FRAGS, false); + if (err < 0) + goto fallback_to_reg_recv; + + err = decrypt_skb(sk, skb, sgin); + for (; pages > 0; pages--) + put_page(sg_page(&sgin[pages])); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } else { +fallback_to_reg_recv: + err = decrypt_skb(sk, skb, NULL); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } + ctx->decrypted = true; + } + + if (!zc) { + chunk = min_t(unsigned int, rxm->full_len, len); + err = skb_copy_datagram_msg(skb, rxm->offset, msg, + chunk); + if (err < 0) + goto recv_end; + } + + copied += chunk; + len -= chunk; + if (likely(!(flags & MSG_PEEK))) { + u8 control = ctx->control; + + if (tls_sw_advance_skb(sk, skb, chunk)) { + /* Return full control message to + * userspace before trying to parse + * another message type + */ + msg->msg_flags |= MSG_EOR; + if (control != TLS_RECORD_TYPE_DATA) + goto recv_end; + } + } + } while (len); + +recv_end: + release_sock(sk); + return copied ? : err; +} + +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sock->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = NULL; + struct sock *sk = sock->sk; + struct sk_buff *skb; + ssize_t copied = 0; + int err = 0; + long timeo; + int chunk; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto splice_read_end; + + /* splice does not support reading control messages */ + if (ctx->control != TLS_RECORD_TYPE_DATA) { + err = -ENOTSUPP; + goto splice_read_end; + } + + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, NULL); + + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto splice_read_end; + } + ctx->decrypted = true; + } + rxm = strp_msg(skb); + + chunk = min_t(unsigned int, rxm->full_len, len); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags); + if (copied < 0) + goto splice_read_end; + + if (likely(!(flags & MSG_PEEK))) + tls_sw_advance_skb(sk, skb, copied); + +splice_read_end: + release_sock(sk); + return copied ? : err; +} + +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int ret; + struct sock *sk = sock->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); + + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); + if (ctx->recv_pkt) + ret |= POLLIN | POLLRDNORM; + + return ret; +} + +static int tls_read_size(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char header[tls_ctx->rx.prepend_size]; + struct strp_msg *rxm = strp_msg(skb); + size_t cipher_overhead; + size_t data_len = 0; + int ret; + + /* Verify that we have a full TLS header, or wait for more data */ + if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) + return 0; + + /* Linearize header to local buffer */ + ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); + + if (ret < 0) + goto read_failure; + + ctx->control = header[0]; + + data_len = ((header[4] & 0xFF) | (header[3] << 8)); + + cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size; + + if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) { + ret = -EMSGSIZE; + goto read_failure; + } + if (data_len < cipher_overhead) { + ret = -EBADMSG; + goto read_failure; + } + + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + ret = -EINVAL; + goto read_failure; + } + + return data_len + TLS_HEADER_SIZE; + +read_failure: + tls_err_abort(strp->sk, ret); + + return ret; +} + +static void tls_queue(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm; + + rxm = strp_msg(skb); + + ctx->decrypted = false; + + ctx->recv_pkt = skb; + strp_pause(strp); + + strp->sk->sk_state_change(strp->sk); +} + +static void tls_data_ready(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + strp_data_ready(&ctx->strp); +} + +void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + if (ctx->aead_recv) { + if (ctx->recv_pkt) { + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; + } + crypto_free_aead(ctx->aead_recv); + strp_stop(&ctx->strp); + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = ctx->saved_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + strp_done(&ctx->strp); + lock_sock(sk); + } tls_free_both_sg(sk); @@ -595,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk) kfree(tls_ctx); } -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context *sw_ctx; + struct cipher_context *cctx; + struct crypto_aead **aead; + struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; char *iv, *rec_seq; int rc = 0; @@ -610,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } - if (ctx->priv_ctx) { - rc = -EEXIST; - goto out; - } - - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx) { + sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); + if (!sw_ctx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx->async_wait); + } else { + sw_ctx = ctx->priv_ctx; } - crypto_init_wait(&sw_ctx->async_wait); - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - crypto_info = &ctx->crypto_send; + if (tx) { + crypto_info = &ctx->crypto_send; + cctx = &ctx->tx; + aead = &sw_ctx->aead_send; + } else { + crypto_info = &ctx->crypto_recv; + cctx = &ctx->rx; + aead = &sw_ctx->aead_recv; + } + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -644,48 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tx.tag_size = tag_size; - ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; - ctx->tx.iv_size = iv_size; - ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); - if (!ctx->tx.iv) { + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; + cctx->tag_size = tag_size; + cctx->overhead_size = cctx->prepend_size + cctx->tag_size; + cctx->iv_size = iv_size; + cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->tx.iv, gcm_128_info->salt, - TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->tx.rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->tx.rec_seq) { + memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + cctx->rec_seq_size = rec_seq_size; + cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); - - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); - - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); - - if (!sw_ctx->aead_send) { - sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0); - if (IS_ERR(sw_ctx->aead_send)) { - rc = PTR_ERR(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + memcpy(cctx->rec_seq, rec_seq, rec_seq_size); + + if (tx) { + sg_init_table(sw_ctx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx->sg_encrypted_data)); + sg_init_table(sw_ctx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + + sg_init_table(sw_ctx->sg_aead_in, 2); + sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_in[1]); + sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); + sg_init_table(sw_ctx->sg_aead_out, 2); + sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_out[1]); + sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + } + + if (!*aead) { + *aead = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(*aead)) { + rc = PTR_ERR(*aead); + *aead = NULL; goto free_rec_seq; } } @@ -694,21 +1145,41 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, + rc = crypto_aead_setkey(*aead, keyval, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size); - if (!rc) - return 0; + rc = crypto_aead_setauthsize(*aead, cctx->tag_size); + if (rc) + goto free_aead; + + if (!tx) { + /* Set up strparser */ + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = tls_queue; + cb.parse_msg = tls_read_size; + + strp_init(&sw_ctx->strp, sk, &cb); + + write_lock_bh(&sk->sk_callback_lock); + sw_ctx->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = tls_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + sw_ctx->sk_poll = sk->sk_socket->ops->poll; + + strp_check_rcv(&sw_ctx->strp); + } + + goto out; free_aead: - crypto_free_aead(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + crypto_free_aead(*aead); + *aead = NULL; free_rec_seq: - kfree(ctx->tx.rec_seq); - ctx->tx.rec_seq = NULL; + kfree(cctx->rec_seq); + cctx->rec_seq = NULL; free_iv: kfree(ctx->tx.iv); ctx->tx.iv = NULL; -- cgit v1.2.3 From d50ccc2d3909fc1b4d40e4af16b026f05dc68707 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:50 +0100 Subject: tipc: add 128-bit node identifier We add a 128-bit node identity, as an alternative to the currently used 32-bit node address. For the sake of compatibility and to minimize message header changes we retain the existing 32-bit address field. When not set explicitly by the user, this field will be filled with a hash value generated from the much longer node identity, and be used as a shorthand value for the latter. We permit either the address or the identity to be set by configuration, but not both, so when the address value is set by a legacy user the corresponding 128-bit node identity is generated based on the that value. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 + net/tipc/addr.c | 81 ++++++++++++++++++++++++++++++++------- net/tipc/addr.h | 28 +++++++++++--- net/tipc/core.c | 4 +- net/tipc/core.h | 6 ++- net/tipc/discover.c | 4 +- net/tipc/link.c | 6 ++- net/tipc/name_distr.c | 6 +-- net/tipc/net.c | 51 ++++++++++++++++-------- net/tipc/net.h | 4 +- net/tipc/node.c | 8 +--- net/tipc/node.h | 4 +- 12 files changed, 148 insertions(+), 56 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d896ded51bcb..0affb682e5e3 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -169,6 +169,8 @@ enum { TIPC_NLA_NET_UNSPEC, TIPC_NLA_NET_ID, /* u32 */ TIPC_NLA_NET_ADDR, /* u32 */ + TIPC_NLA_NET_NODEID, /* u64 */ + TIPC_NLA_NET_NODEID_W1, /* u64 */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 6e06b4d981f1..4841e98591d0 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -1,7 +1,7 @@ /* * net/tipc/addr.c: TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -34,18 +34,9 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include #include "addr.h" #include "core.h" -/** - * in_own_node - test for node inclusion; <0.0.0> always matches - */ -int in_own_node(struct net *net, u32 addr) -{ - return addr == tipc_own_addr(net) || !addr; -} - bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) @@ -61,9 +52,71 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) return false; } -char *tipc_addr_string_fill(char *string, u32 addr) +void tipc_set_node_id(struct net *net, u8 *id) +{ + struct tipc_net *tn = tipc_net(net); + u32 *tmp = (u32 *)id; + + memcpy(tn->node_id, id, NODE_ID_LEN); + tipc_nodeid2string(tn->node_id_string, id); + tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + pr_info("Own node identity %s, cluster identity %u\n", + tipc_own_id_string(net), tn->net_id); +} + +void tipc_set_node_addr(struct net *net, u32 addr) { - snprintf(string, 16, "<%u.%u.%u>", - tipc_zone(addr), tipc_cluster(addr), tipc_node(addr)); - return string; + struct tipc_net *tn = tipc_net(net); + u8 node_id[NODE_ID_LEN] = {0,}; + + tn->node_addr = addr; + if (!tipc_own_id(net)) { + sprintf(node_id, "%x", addr); + tipc_set_node_id(net, node_id); + } + pr_info("32-bit node address hash set to %x\n", addr); +} + +char *tipc_nodeid2string(char *str, u8 *id) +{ + int i; + u8 c; + + /* Already a string ? */ + for (i = 0; i < NODE_ID_LEN; i++) { + c = id[i]; + if (c >= '0' && c <= '9') + continue; + if (c >= 'A' && c <= 'Z') + continue; + if (c >= 'a' && c <= 'z') + continue; + if (c == '.') + continue; + if (c == ':') + continue; + if (c == '_') + continue; + if (c == '-') + continue; + if (c == '@') + continue; + if (c != 0) + break; + } + if (i == NODE_ID_LEN) { + memcpy(str, id, NODE_ID_LEN); + str[NODE_ID_LEN] = 0; + return str; + } + + /* Translate to hex string */ + for (i = 0; i < NODE_ID_LEN; i++) + sprintf(&str[2 * i], "%02x", id[i]); + + /* Strip off trailing zeroes */ + for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) + str[i] = 0; + + return str; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 6b48f0dc0205..31bee0ea7b3e 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -1,7 +1,7 @@ /* * net/tipc/addr.h: Include file for TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems * All rights reserved. * @@ -44,10 +44,22 @@ #include "core.h" static inline u32 tipc_own_addr(struct net *net) +{ + return tipc_net(net)->node_addr; +} + +static inline u8 *tipc_own_id(struct net *net) { struct tipc_net *tn = tipc_net(net); - return tn->own_addr; + if (!strlen(tn->node_id_string)) + return NULL; + return tn->node_id; +} + +static inline char *tipc_own_id_string(struct net *net) +{ + return tipc_net(net)->node_id_string; } static inline u32 tipc_cluster_mask(u32 addr) @@ -65,9 +77,15 @@ static inline int tipc_scope2node(struct net *net, int sc) return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net); } -u32 tipc_own_addr(struct net *net); -int in_own_node(struct net *net, u32 addr); +static inline int in_own_node(struct net *net, u32 addr) +{ + return addr == tipc_own_addr(net) || !addr; +} + bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); -char *tipc_addr_string_fill(char *string, u32 addr); +void tipc_set_node_id(struct net *net, u8 *id); +void tipc_set_node_addr(struct net *net, u32 addr); +char *tipc_nodeid2string(char *str, u8 *id); +u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/core.c b/net/tipc/core.c index 04fd91bb11d7..e92fed49e095 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -56,7 +56,9 @@ static int __net_init tipc_init_net(struct net *net) int err; tn->net_id = 4711; - tn->own_addr = 0; + tn->node_addr = 0; + memset(tn->node_id, 0, sizeof(tn->node_id)); + memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); diff --git a/net/tipc/core.h b/net/tipc/core.h index bd2b112680a4..eabad41cc832 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -72,13 +72,17 @@ struct tipc_monitor; #define NODE_HTABLE_SIZE 512 #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 +#define NODE_ID_LEN 16 +#define NODE_ID_STR_LEN (NODE_ID_LEN * 2 + 1) extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; struct tipc_net { - u32 own_addr; + u8 node_id[NODE_ID_LEN]; + u32 node_addr; + char node_id_string[NODE_ID_STR_LEN]; int net_id; int random; bool legacy_addr_format; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 94d524018ca5..b4c4cd176b9b 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -118,13 +118,11 @@ static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { - char node_addr_str[16]; char media_addr_str[64]; - tipc_addr_string_fill(node_addr_str, node_addr); tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); - pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str, + pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } diff --git a/net/tipc/link.c b/net/tipc/link.c index 4aa56e3bf4fc..bcd76b1e440c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -442,6 +442,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link) { + char *self_str = tipc_own_id_string(net); struct tipc_link *l; l = kzalloc(sizeof(*l), GFP_ATOMIC); @@ -451,7 +452,10 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->session = session; /* Note: peer i/f name is completed by reset/activate message */ - sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); + if (strlen(self_str) > 16) + sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); + else + sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer); strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 7e571f4f47bc..8240a85b0d0c 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -318,7 +318,6 @@ void tipc_named_process_backlog(struct net *net) { struct distr_queue_item *e, *tmp; struct tipc_net *tn = net_generic(net, tipc_net_id); - char addr[16]; unsigned long now = get_jiffies_64(); list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { @@ -326,12 +325,11 @@ void tipc_named_process_backlog(struct net *net) if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) continue; } else { - tipc_addr_string_fill(addr, e->node); - pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", + pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %x key=%u\n", e->dtype, ntohl(e->i.type), ntohl(e->i.lower), ntohl(e->i.upper), - addr, ntohl(e->i.key)); + e->node, ntohl(e->i.key)); } list_del(&e->next); kfree(e); diff --git a/net/tipc/net.c b/net/tipc/net.c index 7f140a5308ee..e78674891166 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,27 +104,31 @@ * - A local spin_lock protecting the queue of subscriber events. */ -int tipc_net_start(struct net *net, u32 addr) +int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { - struct tipc_net *tn = tipc_net(net); - char addr_string[16]; + if (tipc_own_id(net)) { + pr_info("Cannot configure node identity twice\n"); + return -1; + } + pr_info("Started in network mode\n"); - tn->own_addr = addr; + if (node_id) { + tipc_set_node_id(net, node_id); + tipc_net_finalize(net, tipc_own_addr(net)); + } + if (addr) + tipc_net_finalize(net, addr); + return 0; +} - /* Ensure that the new address is visible before we reinit. */ +void tipc_net_finalize(struct net *net, u32 addr) +{ + tipc_set_node_addr(net, addr); smp_mb(); - tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); - - pr_info("Started in network mode\n"); - pr_info("Own node address %s, cluster identity %u\n", - tipc_addr_string_fill(addr_string, addr), - tn->net_id); - return 0; } void tipc_net_stop(struct net *net) @@ -146,8 +150,10 @@ void tipc_net_stop(struct net *net) static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); - void *hdr; + u64 *w0 = (u64 *)&tn->node_id[0]; + u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; + void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); @@ -160,7 +166,10 @@ static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; - + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) + goto attr_msg_full; + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) + goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -212,6 +221,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); + if (err) return err; @@ -236,9 +246,18 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (!addr) return -EINVAL; tn->legacy_addr_format = true; - tipc_net_start(net, addr); + tipc_net_init(net, NULL, addr); } + if (attrs[TIPC_NLA_NET_NODEID]) { + u8 node_id[NODE_ID_LEN]; + u64 *w0 = (u64 *)&node_id[0]; + u64 *w1 = (u64 *)&node_id[8]; + + *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); + *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); + tipc_net_init(net, node_id, 0); + } return 0; } diff --git a/net/tipc/net.h b/net/tipc/net.h index c0306aa2374b..08efa6010022 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -41,10 +41,8 @@ extern const struct nla_policy tipc_nl_net_policy[]; -int tipc_net_start(struct net *net, u32 addr); - +void tipc_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); - int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/node.c b/net/tipc/node.c index 8a4b04933ecc..7b0c99347406 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -883,11 +883,9 @@ void tipc_node_delete_links(struct net *net, int bearer_id) static void tipc_node_reset_links(struct tipc_node *n) { - char addr_string[16]; int i; - pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_warn("Resetting all links to %x\n", n->addr); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); @@ -1074,15 +1072,13 @@ illegal_evt: static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { - char addr_string[16]; struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; - pr_debug("Lost contact with %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_debug("Lost contact with %x\n", n->addr); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); diff --git a/net/tipc/node.h b/net/tipc/node.h index 5fb38cf0bb5c..e06faf4fa55e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,14 +49,14 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_NODE_ID32 = (1 << 5) + TIPC_NODE_ID128 = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ TIPC_BCAST_RCAST | \ TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID32) + TIPC_NODE_ID128) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -- cgit v1.2.3 From e1577c1c881b09e9f15a743a4a1907815b74d0f7 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Mon, 20 Nov 2017 16:14:30 +0200 Subject: ethtool: Add support for configuring PFC stall prevention in ethtool In the event where the device unexpectedly becomes unresponsive for a long period of time, flow control mechanism may propagate pause frames which will cause congestion spreading to the entire network. To prevent this scenario, when the device is stalled for a period longer than a pre-configured timeout, flow control mechanisms are automatically disabled. This patch adds support for the ETHTOOL_PFC_STALL_PREVENTION as a tunable. This API provides support for configuring flow control storm prevention timeout (msec). Signed-off-by: Inbar Karmy Cc: Michal Kubecek Cc: Andrew Lunn Signed-off-by: Saeed Mahameed --- include/uapi/linux/ethtool.h | 4 ++++ net/core/ethtool.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 20da156aaf64..4ca65b56084f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -217,10 +217,14 @@ struct ethtool_value { __u32 data; }; +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ /* * Add your fresh new tubale attribute above and remember to update * tunable_strings[] in net/core/ethtool.c diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 157cd9efa4be..bb6e498c6e3d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -121,6 +121,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", }; static const char @@ -2311,6 +2312,11 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; default: return -EINVAL; } -- cgit v1.2.3 From c4f6699dfcb8558d138fe838f741b2c10f416cf9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:37 -0700 Subject: bpf: introduce BPF_RAW_TRACEPOINT Introduce BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access kernel internal arguments of the tracepoints in their raw form. >From bpf program point of view the access to the arguments look like: struct bpf_raw_tracepoint_args { __u64 args[0]; }; int bpf_prog(struct bpf_raw_tracepoint_args *ctx) { // program can read args[N] where N depends on tracepoint // and statically verified at program load+attach time } kprobe+bpf infrastructure allows programs access function arguments. This feature allows programs access raw tracepoint arguments. Similar to proposed 'dynamic ftrace events' there are no abi guarantees to what the tracepoints arguments are and what their meaning is. The program needs to type cast args properly and use bpf_probe_read() helper to access struct fields when argument is a pointer. For every tracepoint __bpf_trace_##call function is prepared. In assembler it looks like: (gdb) disassemble __bpf_trace_xdp_exception Dump of assembler code for function __bpf_trace_xdp_exception: 0xffffffff81132080 <+0>: mov %ecx,%ecx 0xffffffff81132082 <+2>: jmpq 0xffffffff811231f0 where TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), The above assembler snippet is casting 32-bit 'act' field into 'u64' to pass into bpf_trace_run3(), while 'dev' and 'xdp' args are passed as-is. All of ~500 of __bpf_trace_*() functions are only 5-10 byte long and in total this approach adds 7k bytes to .text. This approach gives the lowest possible overhead while calling trace_xdp_exception() from kernel C code and transitioning into bpf land. Since tracepoint+bpf are used at speeds of 1M+ events per second this is valuable optimization. The new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced that returns anon_inode FD of 'bpf-raw-tracepoint' object. The user space looks like: // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type prog_fd = bpf_prog_load(...); // receive anon_inode fd for given bpf_raw_tracepoint with prog attached raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd); Ctrl-C of tracing daemon or cmdline tool that uses this feature will automatically detach bpf program, unload it and unregister tracepoint probe. On the kernel side the __bpf_raw_tp_map section of pointers to tracepoint definition and to __bpf_trace_*() probe function is used to find a tracepoint with "xdp_exception" name and corresponding __bpf_trace_xdp_exception() probe function which are passed to tracepoint_probe_register() to connect probe with tracepoint. Addition of bpf_raw_tracepoint doesn't interfere with ftrace and perf tracepoint mechanisms. perf_event_open() can be used in parallel on the same tracepoint. Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) are permitted. Each with its own bpf program. The kernel will execute all tracepoint probes and all attached bpf programs. In the future bpf_raw_tracepoints can be extended with query/introspection logic. __bpf_raw_tp_map section logic was contributed by Steven Rostedt Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) Acked-by: Steven Rostedt (VMware) Signed-off-by: Daniel Borkmann --- include/asm-generic/vmlinux.lds.h | 10 +++ include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 42 +++++++++ include/linux/tracepoint-defs.h | 6 ++ include/trace/bpf_probe.h | 92 +++++++++++++++++++ include/trace/define_trace.h | 1 + include/uapi/linux/bpf.h | 11 +++ kernel/bpf/syscall.c | 78 ++++++++++++++++ kernel/trace/bpf_trace.c | 183 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 424 insertions(+) create mode 100644 include/trace/bpf_probe.h (limited to 'include/uapi') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 1ab0e520d6fc..8add3493a202 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -178,6 +178,15 @@ #define TRACE_SYSCALLS() #endif +#ifdef CONFIG_BPF_EVENTS +#define BPF_RAW_TP() STRUCT_ALIGN(); \ + VMLINUX_SYMBOL(__start__bpf_raw_tp) = .; \ + KEEP(*(__bpf_raw_tp_map)) \ + VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .; +#else +#define BPF_RAW_TP() +#endif + #ifdef CONFIG_SERIAL_EARLYCON #define EARLYCON_TABLE() STRUCT_ALIGN(); \ VMLINUX_SYMBOL(__earlycon_table) = .; \ @@ -249,6 +258,7 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ + BPF_RAW_TP() \ TRACEPOINT_STR() /* diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5e2e8a49fb21..6d7243bfb0ff 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a1442c4e513..b0357cd198b0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + return NULL; +} #endif enum { @@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); +void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3); +void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4); +void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5); +void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6); +void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); +void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8); +void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9); +void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10); +void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11); +void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 64ed7064f1fa..22c5a46e9693 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,4 +35,10 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct bpf_raw_event_map { + struct tracepoint *tp; + void *bpf_func; + u32 num_args; +} __aligned(32); + #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h new file mode 100644 index 000000000000..505dae0bed80 --- /dev/null +++ b/include/trace/bpf_probe.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM_VAR + +#ifdef CONFIG_BPF_EVENTS + +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_dynamic_array_len +#define __get_dynamic_array_len(field) \ + ((__entry->__data_loc_##field >> 16) & 0xffff) + +#undef __get_str +#define __get_str(field) ((char *)__get_dynamic_array(field)) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __perf_count +#define __perf_count(c) (c) + +#undef __perf_task +#define __perf_task(t) (t) + +/* cast any integer, pointer, or small struct to u64 */ +#define UINTTYPE(size) \ + __typeof__(__builtin_choose_expr(size == 1, (u8)1, \ + __builtin_choose_expr(size == 2, (u16)2, \ + __builtin_choose_expr(size == 4, (u32)3, \ + __builtin_choose_expr(size == 8, (u64)4, \ + (void)5))))) +#define __CAST_TO_U64(x) ({ \ + typeof(x) __src = (x); \ + UINTTYPE(sizeof(x)) __dst; \ + memcpy(&__dst, &__src, sizeof(__dst)); \ + (u64)__dst; }) + +#define __CAST1(a,...) __CAST_TO_U64(a) +#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__) +#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__) +#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__) +#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__) +#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__) +#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__) +#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__) +#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__) +#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__) +#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__) +#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__) +/* tracepoints with more than 12 arguments will hit build error */ +#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static notrace void \ +__bpf_trace_##call(void *__data, proto) \ +{ \ + struct bpf_prog *prog = __data; \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ +} + +/* + * This part is compiled out, it is only here as a build time check + * to make sure that if the tracepoint handling changes, the + * bpf probe will fail to compile unless it too is updated. + */ +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ +static inline void bpf_test_probe_##call(void) \ +{ \ + check_trace_callback_type_##call(__bpf_trace_##template); \ +} \ +static struct bpf_raw_event_map __used \ + __attribute__((section("__bpf_raw_tp_map"))) \ +__bpf_trace_tp_map_##call = { \ + .tp = &__tracepoint_##call, \ + .bpf_func = (void *)__bpf_trace_##template, \ + .num_args = COUNT_ARGS(args), \ +}; + + +#undef DEFINE_EVENT_PRINT +#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ + DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_BPF_EVENTS */ diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index d9e3d4aa3f6e..cb30c5532144 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -95,6 +95,7 @@ #ifdef TRACEPOINTS_ENABLED #include #include +#include #endif #undef TRACE_EVENT diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18b7c510c511..1878201c2d77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, }; enum bpf_attach_type { @@ -344,6 +346,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -1152,4 +1159,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 77d45bd9f507..95ca2523fa6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,6 +1315,81 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags @@ -1925,6 +2000,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7f9691c86b6e..463e72d18c4c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -735,6 +735,86 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) } } +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + /* largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -908,3 +988,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return ret; } + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = __bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} -- cgit v1.2.3 From f8d16d3edb4dbae080df04318423c360de3c594d Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:45 -0500 Subject: nl80211: Add SOCKET_OWNER support to JOIN_IBSS Signed-off-by: Denis Kenzior [johannes: fix race with wdev lock/unlock by just acquiring once] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.h | 8 ++++---- net/wireless/ibss.c | 27 ++++++--------------------- net/wireless/nl80211.c | 7 ++++++- 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 60fefc5a2ea4..266b43c4f6e1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1962,6 +1962,8 @@ enum nl80211_commands { * multicast group. * If set during %NL80211_CMD_ASSOCIATE or %NL80211_CMD_CONNECT the * station will deauthenticate when the socket is closed. + * If set during %NL80211_CMD_JOIN_IBSS the IBSS will be automatically + * torn down when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/core.h b/net/wireless/core.h index eaff636169c2..b5cf3ea8d4df 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -282,10 +282,10 @@ void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); /* IBSS */ -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys); +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys); void cfg80211_clear_ibss(struct net_device *dev, bool nowext); int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a1d10993d08a..d1743e6abc34 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -84,14 +84,15 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, } EXPORT_SYMBOL(cfg80211_ibss_joined); -static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; + ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (wdev->ssid_len) @@ -146,23 +147,6 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, return 0; } -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - ASSERT_RTNL(); - - wdev_lock(wdev); - err = __cfg80211_join_ibss(rdev, dev, params, connkeys); - wdev_unlock(wdev); - - return err; -} - static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -224,6 +208,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, if (err) return err; + wdev->conn_owner_nlportid = 0; __cfg80211_clear_ibss(dev, nowext); return 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fe27ab443d01..13f7c002f562 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8679,9 +8679,14 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) kzfree(connkeys); + else if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + return err; } -- cgit v1.2.3 From 188c1b3c04d69e842122daf201f07a34fcfad039 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:46 -0500 Subject: nl80211: Add SOCKET_OWNER support to JOIN_MESH Signed-off-by: Denis Kenzior [johannes: fix race with wdev lock/unlock by just acquiring once] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.h | 4 ---- net/wireless/mesh.c | 16 +--------------- net/wireless/nl80211.c | 10 ++++++++-- 4 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 266b43c4f6e1..98eb37def37d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1964,6 +1964,8 @@ enum nl80211_commands { * station will deauthenticate when the socket is closed. * If set during %NL80211_CMD_JOIN_IBSS the IBSS will be automatically * torn down when the socket is closed. + * If set during %NL80211_CMD_JOIN_MESH the mesh setup will be + * automatically torn down when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/core.h b/net/wireless/core.h index b5cf3ea8d4df..63eb1b5fdd04 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -303,10 +303,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf); int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index b12da6ef3c12..eac5aa1419fc 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -217,21 +217,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, return err; } -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - wdev_lock(wdev); - err = __cfg80211_join_mesh(rdev, dev, setup, conf); - wdev_unlock(wdev); - - return err; -} - int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) @@ -286,6 +271,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, err = rdev_leave_mesh(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->mesh_id_len = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 13f7c002f562..1d6e81e5b2c8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10092,7 +10092,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; } else { - /* cfg80211_join_mesh() will sort it out */ + /* __cfg80211_join_mesh() will sort it out */ setup.chandef.chan = NULL; } @@ -10130,7 +10130,13 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) setup.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - return cfg80211_join_mesh(rdev, dev, &setup, &cfg); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg); + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + + return err; } static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From 466a306142c002b40deaa58da94741af4153d1c4 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:47 -0500 Subject: nl80211: Add SOCKET_OWNER support to START_AP Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/ap.c | 1 + net/wireless/nl80211.c | 3 +++ 3 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98eb37def37d..9ea3d6039eca 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1966,6 +1966,8 @@ enum nl80211_commands { * torn down when the socket is closed. * If set during %NL80211_CMD_JOIN_MESH the mesh setup will be * automatically torn down when the socket is closed. + * If set during %NL80211_CMD_START_AP the AP will be automatically + * disabled when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 63682176c96c..882d97bdc6bf 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -27,6 +27,7 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, err = rdev_stop_ap(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); wdev->ssid_len = 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d6e81e5b2c8..cfaf2aeb9783 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4134,6 +4134,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->chandef = params.chandef; wdev->ssid_len = params.ssid_len; memcpy(wdev->ssid, params.ssid, wdev->ssid_len); + + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + wdev->conn_owner_nlportid = info->snd_portid; } wdev_unlock(wdev); -- cgit v1.2.3 From 6a671a50f8199b3e1fe49fa8afff0fc8335da79c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:41 -0500 Subject: nl80211: Add CMD_CONTROL_PORT_FRAME API This commit also adds cfg80211_rx_control_port function. This is used to generate a CMD_CONTROL_PORT_FRAME event out to userspace. The conn_owner_nlportid is used as the unicast destination. This means that userspace must specify NL80211_ATTR_SOCKET_OWNER flag if control port over nl80211 routing is requested in NL80211_CMD_CONNECT, NL80211_CMD_ASSOCIATE, NL80211_CMD_START_AP or IBSS/mesh join. Signed-off-by: Denis Kenzior [johannes: fix return value of cfg80211_rx_control_port()] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 22 +++++++++++++++++ include/uapi/linux/nl80211.h | 13 ++++++++++ net/wireless/nl80211.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 21 ++++++++++++++++ 4 files changed, 114 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bfe174896fcf..df145f76adad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5721,6 +5721,28 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_rx_control_port - notification about a received control port frame + * @dev: The device the frame matched to + * @buf: control port frame + * @len: length of the frame data + * @addr: The peer from which the frame was received + * @proto: frame protocol, typically PAE or Pre-authentication + * @unencrypted: Whether the frame was received unencrypted + * + * This function is used to inform userspace about a received control port + * frame. It should only be used if userspace indicated it wants to receive + * control port frames over nl80211. + * + * The frame is the data portion of the 802.3 or 802.11 data frame with all + * network layer headers removed (e.g. the raw EAPoL frame). + * + * Return: %true if the frame was passed to userspace + */ +bool cfg80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted); + /** * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9ea3d6039eca..6a3cc7a635b5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -990,6 +990,17 @@ * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed * &NL80211_CMD_DISCONNECT should be indicated instead. * + * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request + * and RX notification. This command is used both as a request to transmit + * a control port frame and as a notification that a control port frame + * has been received. %NL80211_ATTR_FRAME is used to specify the + * frame contents. The frame is the raw EAPoL data, without ethernet or + * 802.11 headers. + * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added + * indicating the protocol type of the received frame; whether the frame + * was received unencrypted and the MAC address of the peer respectively. + * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * * @NL80211_CMD_EXTERNAL_AUTH: This interface is exclusively defined for host @@ -1228,6 +1239,8 @@ enum nl80211_commands { NL80211_CMD_STA_OPMODE_CHANGED, + NL80211_CMD_CONTROL_PORT_FRAME, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cfaf2aeb9783..0870447fbd55 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14553,6 +14553,64 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_mgmt_tx_status); +static int __nl80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, + bool unencrypted, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid); + + if (!nlportid) + return -ENOENT; + + msg = nlmsg_new(100 + len, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONTROL_PORT_FRAME); + if (!hdr) { + nlmsg_free(msg); + return -ENOBUFS; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_FRAME, len, buf) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || + (unencrypted && nla_put_flag(msg, + NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +bool cfg80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted) +{ + int ret; + + trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted); + ret = __nl80211_rx_control_port(dev, buf, len, addr, proto, + unencrypted, GFP_ATOMIC); + trace_cfg80211_return_bool(ret == 0); + return ret == 0; +} +EXPORT_SYMBOL(cfg80211_rx_control_port); + static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev, const char *mac, gfp_t gfp) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 5152938b358d..42fd338f879e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2600,6 +2600,27 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_rx_control_port, + TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted), + TP_ARGS(netdev, buf, len, addr, proto, unencrypted), + TP_STRUCT__entry( + NETDEV_ENTRY + MAC_ENTRY(addr) + __field(u16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + NETDEV_ASSIGN; + MAC_ASSIGN(addr, addr); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, MAC_PR_ARG(addr), + __entry->proto, BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(cfg80211_cqm_rssi_notify, TP_PROTO(struct net_device *netdev, enum nl80211_cqm_rssi_threshold_event rssi_event, -- cgit v1.2.3 From 2576a9ace47eba28a682d249d1d6402f891808c9 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:42 -0500 Subject: nl80211: Implement TX of control port frames This commit implements the TX side of NL80211_CMD_CONTROL_PORT_FRAME. Userspace provides the raw EAPoL frame using NL80211_ATTR_FRAME. Userspace should also provide the destination address and the protocol type to use when sending the frame. This is used to implement TX of Pre-authentication frames. If CONTROL_PORT_ETHERTYPE_NO_ENCRYPT is specified, then the driver will be asked not to encrypt the outgoing frame. A new EXT_FEATURE flag is introduced so that nl80211 code can check whether a given wiphy has capability to pass EAPoL frames over nl80211. Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 ++++++ include/uapi/linux/nl80211.h | 3 ++ net/wireless/nl80211.c | 71 +++++++++++++++++++++++++++++++++++++++++++- net/wireless/rdev-ops.h | 15 ++++++++++ net/wireless/trace.h | 26 ++++++++++++++++ 5 files changed, 123 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index df145f76adad..de2894a4ad10 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2961,6 +2961,9 @@ struct cfg80211_external_auth_params { * * @external_auth: indicates result of offloaded authentication processing from * user space + * + * @tx_control_port: TX a control port frame (EAPoL). The noencrypt parameter + * tells the driver that the frame should not be encrypted. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3256,6 +3259,12 @@ struct cfg80211_ops { const u8 *aa); int (*external_auth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_external_auth_params *params); + + int (*tx_control_port)(struct wiphy *wiphy, + struct net_device *dev, + const u8 *buf, size_t len, + const u8 *dest, const __be16 proto, + const bool noencrypt); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6a3cc7a635b5..3167d6f7fc68 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5024,6 +5024,8 @@ enum nl80211_feature_flags { * channel change triggered by radar detection event. * No need to start CAC from user-space, no need to react to * "radar detected" event. + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and + * receiving control port frames over nl80211 instead of the netdevice. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5055,6 +5057,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_LOW_POWER_SCAN, NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, NL80211_EXT_FEATURE_DFS_OFFLOAD, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0870447fbd55..6eb286784924 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12535,6 +12535,68 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) return rdev_external_auth(rdev, dev, ¶ms); } +static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *buf; + size_t len; + u8 *dest; + u16 proto; + bool noencrypt; + int err; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + if (!rdev->ops->tx_control_port) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_FRAME] || + !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + GENL_SET_ERR_MSG(info, "Frame, MAC or ethertype missing"); + return -EINVAL; + } + + wdev_lock(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + break; + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + wdev_unlock(wdev); + + buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + dest = nla_data(info->attrs[NL80211_ATTR_MAC]); + proto = nla_get_u16(info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + noencrypt = + nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); + + return rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt); + + out: + wdev_unlock(wdev); + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13438,7 +13500,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, - + { + .cmd = NL80211_CMD_CONTROL_PORT_FRAME, + .doit = nl80211_tx_control_port, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 84f23ae015fc..87479a53411b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -714,6 +714,21 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const void *buf, size_t len, + const u8 *dest, __be16 proto, + const bool noencrypt) +{ + int ret; + trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 42fd338f879e..a64291ae52a6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1882,6 +1882,32 @@ TRACE_EVENT(rdev_mgmt_tx, BOOL_TO_STR(__entry->dont_wait_for_ack)) ); +TRACE_EVENT(rdev_tx_control_port, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *buf, size_t len, const u8 *dest, __be16 proto, + bool unencrypted), + TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + __field(__be16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," + " proto: 0x%x, unencrypted: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), + be16_to_cpu(__entry->proto), + BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(rdev_set_noack_map, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u16 noack_map), -- cgit v1.2.3 From 64bf3d4bc2b0725b3c5ffadd982a9746bfc738b7 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:43 -0500 Subject: nl80211: Add CONTROL_PORT_OVER_NL80211 attribute Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 14 +++++++++++++- net/wireless/nl80211.c | 26 ++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index de2894a4ad10..0bd957b37208 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -647,6 +647,8 @@ struct survey_info { * allowed through even on unauthorized ports * @control_port_no_encrypt: TRUE to prevent encryption of control port * protocol frames. + * @control_port_over_nl80211: TRUE if userspace expects to exchange control + * port frames over NL80211 instead of the network interface. * @wep_keys: static WEP keys, if not NULL points to an array of * CFG80211_MAX_WEP_KEYS WEP keys * @wep_tx_key: key index (0..3) of the default TX static WEP key @@ -662,6 +664,7 @@ struct cfg80211_crypto_settings { bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; + bool control_port_over_nl80211; struct key_params *wep_keys; int wep_tx_key; const u8 *psk; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 3167d6f7fc68..15daf5e2638d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -542,7 +542,8 @@ * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, - * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, %NL80211_ATTR_MAC_HINT, and + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, + * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and * %NL80211_ATTR_WIPHY_FREQ_HINT. * If included, %NL80211_ATTR_MAC and %NL80211_ATTR_WIPHY_FREQ are * restrictions on BSS selection, i.e., they effectively prevent roaming @@ -1488,6 +1489,15 @@ enum nl80211_commands { * @NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT: When included along with * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, indicates that the custom * ethertype frames used for key negotiation must not be encrypted. + * @NL80211_ATTR_CONTROL_PORT_OVER_NL80211: A flag indicating whether control + * port frames (e.g. of type given in %NL80211_ATTR_CONTROL_PORT_ETHERTYPE) + * will be sent directly to the network interface or sent via the NL80211 + * socket. If this attribute is missing, then legacy behavior of sending + * control port frames directly to the network interface is used. If the + * flag is included, then control port frames are sent over NL80211 instead + * using %CMD_CONTROL_PORT_FRAME. If control port routing over NL80211 is + * to be used then userspace must also use the %NL80211_ATTR_SOCKET_OWNER + * flag. * * @NL80211_ATTR_TESTDATA: Testmode data blob, passed through to the driver. * We recommend using nested, driver-specific attributes within this. @@ -2647,6 +2657,8 @@ enum nl80211_attrs { NL80211_ATTR_NSS, NL80211_ATTR_ACK_SIGNAL, + NL80211_ATTR_CONTROL_PORT_OVER_NL80211, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6eb286784924..d3b14d9d002a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -287,6 +287,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, @@ -8211,6 +8212,22 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return err; } +static int validate_pae_over_nl80211(struct cfg80211_registered_device *rdev, + struct genl_info *info) +{ + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, "SOCKET_OWNER not set"); + return -EINVAL; + } + + if (!rdev->ops->tx_control_port || + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_crypto_settings *settings, @@ -8234,6 +8251,15 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, } else settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + settings->control_port_over_nl80211 = true; + } + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { void *data; int len, i; -- cgit v1.2.3 From 39c202d228c3da5a5531be847e9b06cc9b787f31 Mon Sep 17 00:00:00 2001 From: Bernie Harris Date: Wed, 21 Mar 2018 15:42:15 +1300 Subject: netfilter: ebtables: Add support for specifying match revision Currently ebtables assumes that the revision number of all match modules is 0, which is an issue when trying to use existing xtables matches with ebtables. The solution is to modify ebtables to allow extensions to specify a revision number, similar to iptables. This gets passed down to the kernel, which is then able to find the match module correctly. To main binary backwards compatibility, the size of the ebt_entry structures is not changed, only the size of the name field is decreased by 1 byte to make room for the revision field. Signed-off-by: Bernie Harris Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 16 +++++++-- net/bridge/netfilter/ebtables.c | 47 ++++++++++++++++---------- 2 files changed, 42 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 9ff57c0a0199..0c7dc8315013 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -20,6 +20,7 @@ #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_FUNCTION_MAXNAMELEN EBT_TABLE_MAXNAMELEN +#define EBT_EXTENSION_MAXNAMELEN 31 /* verdicts >0 are "branches" */ #define EBT_ACCEPT -1 @@ -120,7 +121,10 @@ struct ebt_entries { struct ebt_entry_match { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_match *match; } u; /* size of data */ @@ -130,7 +134,10 @@ struct ebt_entry_match { struct ebt_entry_watcher { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_target *watcher; } u; /* size of data */ @@ -140,7 +147,10 @@ struct ebt_entry_watcher { struct ebt_entry_target { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_target *target; } u; /* size of data */ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9a26d2b7420f..a8cb543e3296 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -356,12 +356,12 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { if (!IS_ERR(match)) module_put(match->me); request_module("ebt_%s", m->u.name); - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); } if (IS_ERR(match)) return PTR_ERR(match); @@ -1350,16 +1350,17 @@ static int update_counters(struct net *net, const void __user *user, static inline int ebt_obj_to_user(char __user *um, const char *_name, const char *data, int entrysize, - int usersize, int datasize) + int usersize, int datasize, u8 revision) { - char name[EBT_FUNCTION_MAXNAMELEN] = {0}; + char name[EBT_EXTENSION_MAXNAMELEN] = {0}; - /* ebtables expects 32 bytes long names but xt_match names are 29 bytes + /* ebtables expects 31 bytes long names but xt_match names are 29 bytes * long. Copy 29 bytes and fill remaining bytes with zeroes. */ strlcpy(name, _name, sizeof(name)); - if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || - put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || + if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) || + put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) || + put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) || xt_data_to_user(um + entrysize, data, usersize, datasize, XT_ALIGN(datasize))) return -EFAULT; @@ -1372,7 +1373,8 @@ static inline int ebt_match_to_user(const struct ebt_entry_match *m, { return ebt_obj_to_user(ubase + ((char *)m - base), m->u.match->name, m->data, sizeof(*m), - m->u.match->usersize, m->match_size); + m->u.match->usersize, m->match_size, + m->u.match->revision); } static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, @@ -1380,7 +1382,8 @@ static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, { return ebt_obj_to_user(ubase + ((char *)w - base), w->u.watcher->name, w->data, sizeof(*w), - w->u.watcher->usersize, w->watcher_size); + w->u.watcher->usersize, w->watcher_size, + w->u.watcher->revision); } static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, @@ -1411,7 +1414,8 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, if (ret != 0) return ret; ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t), - t->u.target->usersize, t->target_size); + t->u.target->usersize, t->target_size, + t->u.target->revision); if (ret != 0) return ret; @@ -1599,7 +1603,10 @@ struct compat_ebt_replace { /* struct ebt_entry_match, _target and _watcher have same layout */ struct compat_ebt_entry_mwt { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + u8 revision; + }; compat_uptr_t ptr; } u; compat_uint_t match_size; @@ -1638,8 +1645,9 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, BUG_ON(off >= m->match_size); - if (copy_to_user(cm->u.name, match->name, - strlen(match->name) + 1) || put_user(msize, &cm->match_size)) + if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) || + put_user(match->revision, &cm->u.revision) || + put_user(msize, &cm->match_size)) return -EFAULT; if (match->compat_to_user) { @@ -1668,8 +1676,9 @@ static int compat_target_to_user(struct ebt_entry_target *t, BUG_ON(off >= t->target_size); - if (copy_to_user(cm->u.name, target->name, - strlen(target->name) + 1) || put_user(tsize, &cm->match_size)) + if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) || + put_user(target->revision, &cm->u.revision) || + put_user(tsize, &cm->match_size)) return -EFAULT; if (target->compat_to_user) { @@ -1933,7 +1942,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct ebt_entries_buf_state *state, const unsigned char *base) { - char name[EBT_FUNCTION_MAXNAMELEN]; + char name[EBT_EXTENSION_MAXNAMELEN]; struct xt_match *match; struct xt_target *wt; void *dst = NULL; @@ -1947,7 +1956,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, switch (compat_mwt) { case EBT_COMPAT_MATCH: - match = xt_request_find_match(NFPROTO_BRIDGE, name, 0); + match = xt_request_find_match(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(match)) return PTR_ERR(match); @@ -1966,7 +1976,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; case EBT_COMPAT_WATCHER: /* fallthrough */ case EBT_COMPAT_TARGET: - wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0); + wt = xt_request_find_target(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); off = xt_compat_target_offset(wt); -- cgit v1.2.3 From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:00 -0700 Subject: bpf: Check attach type at prog load time == The problem == There are use-cases when a program of some type can be attached to multiple attach points and those attach points must have different permissions to access context or to call helpers. E.g. context structure may have fields for both IPv4 and IPv6 but it doesn't make sense to read from / write to IPv6 field when attach point is somewhere in IPv4 stack. Same applies to BPF-helpers: it may make sense to call some helper from some attach point, but not from other for same prog type. == The solution == Introduce `expected_attach_type` field in in `struct bpf_attr` for `BPF_PROG_LOAD` command. If scenario described in "The problem" section is the case for some prog type, the field will be checked twice: 1) At load time prog type is checked to see if attach type for it must be known to validate program permissions correctly. Prog will be rejected with EINVAL if it's the case and `expected_attach_type` is not specified or has invalid value. 2) At attach time `attach_type` is compared with `expected_attach_type`, if prog type requires to have one, and, if they differ, attach will be rejected with EINVAL. The `expected_attach_type` is now available as part of `struct bpf_prog` in both `bpf_verifier_ops->is_valid_access()` and `bpf_verifier_ops->get_func_proto()` () and can be used to check context accesses and calls to helpers correspondingly. Initially the idea was discussed by Alexei Starovoitov and Daniel Borkmann here: https://marc.info/?l=linux-netdev&m=152107378717201&w=2 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 ++++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 3 ++- kernel/bpf/syscall.c | 31 ++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 6 +++--- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++--------- net/core/filter.c | 39 +++++++++++++++++++++++++-------------- 8 files changed, 88 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 819229c80eca..95a7abd0ee92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -208,12 +208,15 @@ struct bpf_prog_ops { struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + const struct bpf_func_proto * + (*get_func_proto)(enum bpf_func_id func_id, + const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 897ff3d95968..13c044e4832d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,6 +469,7 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1878201c2d77..102718624d1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -296,6 +296,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c1c0b60d3f2f..8730b24ed540 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -545,7 +545,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,6 +566,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 95ca2523fa6e..9d3b572d4dec 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,8 +1171,27 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1209,11 +1228,16 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1474,6 +1498,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acd2207e412..10024323031d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1323,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2349,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -5572,7 +5572,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 463e72d18c4c..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -731,7 +737,7 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } @@ -781,7 +787,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -789,12 +796,13 @@ static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { /* largest tracepoint in the kernel has 12 args */ @@ -816,6 +824,7 @@ const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); diff --git a/net/core/filter.c b/net/core/filter.c index e989bf313195..7790fd128614 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3685,7 +3685,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3699,7 +3699,7 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3714,7 +3714,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3781,7 +3781,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3804,7 +3804,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3831,7 +3831,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3847,7 +3847,8 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: @@ -3863,7 +3864,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3888,7 +3890,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -3918,11 +3920,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3966,6 +3969,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3986,11 +3990,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4020,11 +4025,12 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4096,6 +4102,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4125,7 +4132,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4142,6 +4149,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4174,6 +4182,7 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4220,6 +4229,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4249,11 +4259,12 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) -- cgit v1.2.3 From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:02 -0700 Subject: bpf: Hooks for sys_bind == The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever TCP/UDP server calls `bind(2)`, the library replaces IP in sockaddr before passing arguments to syscall. When application calls `connect(2)` the library transparently binds the local end of connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty). Shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and provided by user `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of newly added types. The new attach types provides hooks in `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override IP addresses and ports user program tries to bind to and apply such a program for whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for corresponding socket family. E.g. if user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with value to write and `dst` with pointer to context that can't be changed not to break later instructions. But the fields, allowed to write to, are not available directly and to access them address of corresponding pointer has to be loaded first. To get additional register the 1st not used by `src` and `dst` one is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load address of pointer field, and finally the register's content is restored from the temporary field after writing `src` value. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 21 ++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/cgroup.c | 36 +++++++ kernel/bpf/syscall.c | 36 +++++-- kernel/bpf/verifier.c | 1 + net/core/filter.c | 232 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 7 ++ net/ipv6/af_inet6.c | 7 ++ 10 files changed, 366 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8a4566691c8f..67dc4a6471ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include struct sock; +struct sockaddr; struct cgroup; struct sk_buff; struct bpf_sock_ops_kern; @@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type); + int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); @@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6d7243bfb0ff..2b28fcf6f6ae 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) diff --git a/include/linux/filter.h b/include/linux/filter.h index 13c044e4832d..fc4e8f91b03d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_addr_kern { + struct sock *sk; + struct sockaddr *uaddr; + /* Temporary "register" to make indirect stores to nested structures + * defined above. We need three registers to make such a store, but + * only two (src and dst) are available at convert_ctx_access time + */ + u64 tmp_reg; +}; + struct bpf_sock_ops_kern { struct sock *sk; u32 op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 102718624d1e..ce3e69e3c793 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,6 +136,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -147,6 +148,8 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -1010,6 +1013,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8730b24ed540..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -494,6 +494,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); +/** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be exectuted + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9d3b572d4dec..2cad66a4cacb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1175,19 +1175,29 @@ static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } } /* last field in 'union bpf_attr' used by this command */ @@ -1479,6 +1489,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1541,6 +1555,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1590,6 +1608,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10024323031d..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3887,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index 7790fd128614..c08e5b121558 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3698,6 +3698,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4180,6 +4194,69 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 contex and vise + * versa. + */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now. */ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -4724,6 +4801,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. + */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5181,6 +5404,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e8c7fad8c329..2dec266507dc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + goto out; + if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dbbe04018813..fa24e3f06ac6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,6 +295,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; -- cgit v1.2.3 From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:05 -0700 Subject: bpf: Hooks for sys_connect == The problem == See description of the problem in the initial patch of this patch set. == The solution == The patch provides much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connecttion from desired IP. It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time. Local end of connection can be bound to desired IP using newly introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though, and doesn't support binding to port, i.e. leverages `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this: * looking for a free port is expensive and can affect performance significantly; * there is no use-case for port. As for remote end (`struct sockaddr *` passed by user), both parts of it can be overridden, remote IP and remote port. It's useful if an application inside cgroup wants to connect to another application inside same cgroup or to itself, but knows nothing about IP assigned to the cgroup. Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for same reason as sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. == Implementation notes == The patch introduces new field in `struct proto`: `pre_connect` that is a pointer to a function with same signature as `connect` but is called before it. The reason is in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically `inet_dgram_connect` autobinds socket before calling `sk->sk_prot->connect` and there is no way to call `bpf_bind()` from hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since it'd cause double-bind. On the other hand `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for necessary `proto` and call them at desired time before `connect`. Since `bpf_bind()` is allowed to bind only to IP and autobind in `inet_dgram_connect` binds only port there is no chance of double-bind. bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite of value of `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling to __inet_bind() and __inet6_bind() since all call-sites, where bpf_bind() is called, already hold socket lock. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++++++ include/net/addrconf.h | 7 ++++++ include/net/sock.h | 3 +++ include/net/udp.h | 1 + include/uapi/linux/bpf.h | 12 +++++++++- kernel/bpf/syscall.c | 8 +++++++ net/core/filter.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 13 +++++++++++ net/ipv4/tcp_ipv4.c | 16 +++++++++++++ net/ipv4/udp.c | 14 ++++++++++++ net/ipv6/af_inet6.c | 5 ++++ net/ipv6/tcp_ipv6.c | 16 +++++++++++++ net/ipv6/udp.c | 20 ++++++++++++++++ 13 files changed, 202 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 67dc4a6471ad..c6ab295e6dcb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + release_sock(sk); \ + } \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ + sk->sk_prot->pre_connect) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -151,11 +177,16 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 132e5b95167a..378d601258be 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -231,6 +231,13 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; +/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/sock.h b/include/net/sock.h index b8ff435fa96e..49bd2c1796b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1026,6 +1026,9 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); + int (*pre_connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/udp.h b/include/net/udp.h index 850a8e581cce..0676b272f6ac 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,6 +273,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ce3e69e3c793..77afaf1ba556 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -150,6 +150,8 @@ enum bpf_attach_type { BPF_SK_MSG_VERDICT, BPF_CGROUP_INET4_BIND, BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, __MAX_BPF_ATTACH_TYPE }; @@ -744,6 +746,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. + * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -809,7 +818,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2cad66a4cacb..cf1b29bc0ab8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1180,6 +1180,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: return 0; default: return -EINVAL; @@ -1491,6 +1493,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1557,6 +1561,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1610,6 +1616,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index c08e5b121558..bdb9cadd4d27 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -3656,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. + */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3707,6 +3754,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -4213,6 +4268,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: break; default: return false; @@ -4221,6 +4277,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: break; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e203a39d6988..488fe26ac8e5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -547,12 +547,19 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -633,6 +640,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c6aec2643e8..3c11d992d784 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2409,6 +2424,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 908fc02fb4f8..9c6c77fec963 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1658,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2530,6 +2543,7 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 13110bee5c14..566cec0e0a44 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -887,6 +887,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1043,6 +1047,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5425d7b100ee..6469b741cf5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1925,6 +1940,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ad30f5e31969..6861ed479469 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1512,6 +1531,7 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, -- cgit v1.2.3 From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:07 -0700 Subject: bpf: Post-hooks for sys_bind "Post-hooks" are hooks that are called right before returning from sys_bind. At this time IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind but BPF program has a chance to inspect the socket and change sys_bind result. Specifically it can e.g. inspect what port was allocated and if it doesn't satisfy some policy, BPF program can force sys_bind to fail and return EPERM to user. Another example of usage is recording the IP:port pair to some map to use it in later calls to sys_connect. E.g. if some TCP server inside cgroup was bound to some IP:port_n, it can be recorded to a map. And later when some TCP client inside same cgroup is trying to connect to 127.0.0.1:port_n, BPF hook for sys_connect can override the destination and connect application to IP:port_n instead of 127.0.0.1:port_n. That helps forcing all applications inside a cgroup to use desired IP and not break those applications if they e.g. use localhost to communicate between each other. == Implementation details == Post-hooks are implemented as two new attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`. Separate attach types for IPv4 and IPv6 are introduced to avoid access to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from `inet6_bind()` since those fields might not make sense in such cases. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 16 +++++-- include/uapi/linux/bpf.h | 11 +++++ kernel/bpf/syscall.c | 43 +++++++++++++++++ net/core/filter.c | 116 +++++++++++++++++++++++++++++++++++++++------ net/ipv4/af_inet.c | 18 ++++--- net/ipv6/af_inet6.c | 21 +++++--- 6 files changed, 195 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c6ab295e6dcb..30d15e64b993 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77afaf1ba556..c5ec89732a8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_BIND, BPF_CGROUP_INET4_CONNECT, BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -948,6 +950,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf1b29bc0ab8..0244973ee544 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,11 +1171,46 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. + * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. + */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: @@ -1195,6 +1230,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; default: @@ -1240,6 +1276,7 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) return -EINVAL; @@ -1489,6 +1526,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1557,6 +1596,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1616,6 +1657,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: diff --git a/net/core/filter.c b/net/core/filter.c index bdb9cadd4d27..d31aff93270d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4097,30 +4097,80 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; default: return false; } } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sock)) + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -4728,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4783,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 488fe26ac8e5..142d4c35b493 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -519,12 +519,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 566cec0e0a44..41f50472679d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -412,13 +412,20 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) -- cgit v1.2.3 From 7a74d39cc2927302bc236397c1fdb1fe5be209ce Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:44 +0200 Subject: tipc: tipc: rename address types in user api The three address type structs in the user API have names that in reality reflect the specific, non-Linux environment where they were originally created. We now give them more intuitive names, in accordance with how TIPC is described in the current documentation. struct tipc_portid -> struct tipc_socket_addr struct tipc_name -> struct tipc_service_addr struct tipc_name_seq -> struct tipc_service_range To avoid confusion, we also update some commmets and macro names to match the new terminology. For compatibility, we add macros that map all old names to the new ones. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 57 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 4ac9f1f02b06..156224ac3d74 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -45,33 +45,33 @@ * TIPC addressing primitives */ -struct tipc_portid { +struct tipc_socket_addr { __u32 ref; __u32 node; }; -struct tipc_name { +struct tipc_service_addr { __u32 type; __u32 instance; }; -struct tipc_name_seq { +struct tipc_service_range { __u32 type; __u32 lower; __u32 upper; }; /* - * Application-accessible port name types + * Application-accessible service types */ -#define TIPC_CFG_SRV 0 /* configuration service name type */ -#define TIPC_TOP_SRV 1 /* topology service name type */ -#define TIPC_LINK_STATE 2 /* link state name type */ -#define TIPC_RESERVED_TYPES 64 /* lowest user-publishable name type */ +#define TIPC_NODE_STATE 0 /* node state service type */ +#define TIPC_TOP_SRV 1 /* topology server service type */ +#define TIPC_LINK_STATE 2 /* link state service type */ +#define TIPC_RESERVED_TYPES 64 /* lowest user-allowed service type */ /* - * Publication scopes when binding port names and port name sequences + * Publication scopes when binding service / service range */ enum tipc_scope { TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */ @@ -108,28 +108,28 @@ enum tipc_scope { * TIPC topology subscription service definitions */ -#define TIPC_SUB_PORTS 0x01 /* filter for port availability */ -#define TIPC_SUB_SERVICE 0x02 /* filter for service availability */ -#define TIPC_SUB_CANCEL 0x04 /* cancel a subscription */ +#define TIPC_SUB_PORTS 0x01 /* filter: evt at each match */ +#define TIPC_SUB_SERVICE 0x02 /* filter: evt at first up/last down */ +#define TIPC_SUB_CANCEL 0x04 /* filter: cancel a subscription */ #define TIPC_WAIT_FOREVER (~0) /* timeout for permanent subscription */ struct tipc_subscr { - struct tipc_name_seq seq; /* name sequence of interest */ + struct tipc_service_range seq; /* range of interest */ __u32 timeout; /* subscription duration (in ms) */ __u32 filter; /* bitmask of filter options */ char usr_handle[8]; /* available for subscriber use */ }; #define TIPC_PUBLISHED 1 /* publication event */ -#define TIPC_WITHDRAWN 2 /* withdraw event */ +#define TIPC_WITHDRAWN 2 /* withdrawal event */ #define TIPC_SUBSCR_TIMEOUT 3 /* subscription timeout event */ struct tipc_event { __u32 event; /* event type */ - __u32 found_lower; /* matching name seq instances */ - __u32 found_upper; /* " " " " */ - struct tipc_portid port; /* associated port */ + __u32 found_lower; /* matching range */ + __u32 found_upper; /* " " */ + struct tipc_socket_addr port; /* associated socket */ struct tipc_subscr s; /* associated subscription */ }; @@ -149,20 +149,20 @@ struct tipc_event { #define SOL_TIPC 271 #endif -#define TIPC_ADDR_NAMESEQ 1 -#define TIPC_ADDR_MCAST 1 -#define TIPC_ADDR_NAME 2 -#define TIPC_ADDR_ID 3 +#define TIPC_ADDR_MCAST 1 +#define TIPC_SERVICE_RANGE 1 +#define TIPC_SERVICE_ADDR 2 +#define TIPC_SOCKET_ADDR 3 struct sockaddr_tipc { unsigned short family; unsigned char addrtype; signed char scope; union { - struct tipc_portid id; - struct tipc_name_seq nameseq; + struct tipc_socket_addr id; + struct tipc_service_range nameseq; struct { - struct tipc_name name; + struct tipc_service_addr name; __u32 domain; } name; } addr; @@ -230,8 +230,13 @@ struct tipc_sioc_ln_req { /* The macros and functions below are deprecated: */ +#define TIPC_CFG_SRV 0 #define TIPC_ZONE_SCOPE 1 +#define TIPC_ADDR_NAMESEQ 1 +#define TIPC_ADDR_NAME 2 +#define TIPC_ADDR_ID 3 + #define TIPC_NODE_BITS 12 #define TIPC_CLUSTER_BITS 12 #define TIPC_ZONE_BITS 8 @@ -250,6 +255,10 @@ struct tipc_sioc_ln_req { #define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) +#define tipc_portid tipc_socket_addr +#define tipc_name tipc_service_addr +#define tipc_name_seq tipc_service_range + static inline __u32 tipc_addr(unsigned int zone, unsigned int cluster, unsigned int node) -- cgit v1.2.3 From 7494cfa6d36d1556f17baa012dd93833620783db Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:45 +0200 Subject: tipc: avoid possible string overflow gcc points out that the combined length of the fixed-length inputs to l->name is larger than the destination buffer size: net/tipc/link.c: In function 'tipc_link_create': net/tipc/link.c:465:26: error: '%s' directive writing up to 32 bytes into a region of size between 26 and 58 [-Werror=format-overflow=] sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); net/tipc/link.c:465:2: note: 'sprintf' output 11 or more bytes (assuming 75) into a destination of size 60 sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); A detailed analysis reveals that the theoretical maximum length of a link name is: max self_str + 1 + max if_name + 1 + max peer_str + 1 + max if_name = 16 + 1 + 15 + 1 + 16 + 1 + 15 = 65 Since we also need space for a trailing zero we now set MAX_LINK_NAME to 68. Just to be on the safe side we also replace the sprintf() call with snprintf(). Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Reported-by: Arnd Bergmann Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 2 +- net/tipc/link.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 156224ac3d74..bf6d28677cfe 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -216,7 +216,7 @@ struct tipc_group_req { #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 -#define TIPC_MAX_LINK_NAME 60 +#define TIPC_MAX_LINK_NAME 68 #define SIOCGETLINKNAME SIOCPROTOPRIVATE diff --git a/net/tipc/link.c b/net/tipc/link.c index 8f2a9496439b..695acb783969 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -462,7 +462,8 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ - sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); + snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown", + self_str, if_name, peer_str); strcpy(l->if_name, if_name); l->addr = peer; -- cgit v1.2.3