From cc5453a5b7e90c39f713091a7ebc53c1f87d1700 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Aug 2020 16:15:58 +0200 Subject: netfilter: conntrack: allow sctp hearbeat after connection re-use If an sctp connection gets re-used, heartbeats are flagged as invalid because their vtag doesn't match. Handle this in a similar way as TCP conntrack when it suspects that the endpoints and conntrack are out-of-sync. When a HEARTBEAT request fails its vtag validation, flag this in the conntrack state and accept the packet. When a HEARTBEAT_ACK is received with an invalid vtag in the reverse direction after we allowed such a HEARTBEAT through, assume we are out-of-sync and re-set the vtag info. v2: remove left-over snippet from an older incarnation that moved new_state/old_state assignments, thats not needed so keep that as-is. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h index 9a33f171aa82..625f491b95de 100644 --- a/include/linux/netfilter/nf_conntrack_sctp.h +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -9,6 +9,8 @@ struct ip_ct_sctp { enum sctp_conntrack state; __be32 vtag[IP_CT_DIR_MAX]; + u8 last_dir; + u8 flags; }; #endif /* _NF_CONNTRACK_SCTP_H */ -- cgit v1.2.3 From 4d2d4ce001f283ed8127173543b4cfb65641e357 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:01 +0200 Subject: KVM: arm64: Drop type input from kvm_put_guest We can use typeof() to avoid the need for the type input. Suggested-by: Marc Zyngier Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-4-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 2 +- include/linux/kvm_host.h | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 95f9580275b1..241ded7ee0ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -32,7 +32,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le, u64); + kvm_put_guest(kvm, base + offset, steal_le); srcu_read_unlock(&kvm->srcu, idx); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a23076765b4c..84371fb06209 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,25 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); -#define __kvm_put_guest(kvm, gfn, offset, value, type) \ +#define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ - type __user *__uaddr = (type __user *)(__addr + offset); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ - __ret = put_user(value, __uaddr); \ + __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) -#define kvm_put_guest(kvm, gpa, value, type) \ +#define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ + \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ - offset_in_page(__gpa), (value), type); \ + offset_in_page(__gpa), v); \ }) int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -- cgit v1.2.3 From 53f985584e3c2ebe5f2455530fbf87a001528db8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:02 +0200 Subject: KVM: arm64: pvtime: Fix stolen time accounting across migration When updating the stolen time we should always read the current stolen time from the user provided memory, not from a kernel cache. If we use a cache then we'll end up resetting stolen time to zero on the first update after migration. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-5-drjones@redhat.com --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/pvtime.c | 23 +++++++++-------------- include/linux/kvm_host.h | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65568b23868a..dd9c3b25aa1e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -368,7 +368,6 @@ struct kvm_vcpu_arch { /* Guest PV state */ struct { - u64 steal; u64 last_steal; gpa_t base; } steal; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 241ded7ee0ad..75234321d896 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -13,26 +13,22 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; u64 last_steal = vcpu->arch.steal.last_steal; - u64 steal; - __le64 steal_le; - u64 offset; + u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + u64 steal = 0; int idx; - u64 base = vcpu->arch.steal.base; if (base == GPA_INVALID) return; - /* Let's do the local bookkeeping */ - steal = vcpu->arch.steal.steal; - vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); - steal += vcpu->arch.steal.last_steal - last_steal; - vcpu->arch.steal.steal = steal; - - steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); - offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le); + if (!kvm_get_guest(kvm, base + offset, steal)) { + steal = le64_to_cpu(steal); + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; + kvm_put_guest(kvm, base + offset, cpu_to_le64(steal)); + } srcu_read_unlock(&kvm->srcu, idx); } @@ -66,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) * Start counting stolen time from the time the guest requests * the feature enabled. */ - vcpu->arch.steal.steal = 0; vcpu->arch.steal.last_steal = current->sched_info.run_delay; idx = srcu_read_lock(&kvm->srcu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84371fb06209..05e3c2fb3ef7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,6 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); +#define __kvm_get_guest(kvm, gfn, offset, v) \ +({ \ + unsigned long __addr = gfn_to_hva(kvm, gfn); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ + int __ret = -EFAULT; \ + \ + if (!kvm_is_error_hva(__addr)) \ + __ret = get_user(v, __uaddr); \ + __ret; \ +}) + +#define kvm_get_guest(kvm, gpa, v) \ +({ \ + gpa_t __gpa = gpa; \ + struct kvm *__kvm = kvm; \ + \ + __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ + offset_in_page(__gpa), v); \ +}) + #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ -- cgit v1.2.3 From be769db2f95861cc8c7c8fedcc71a8c39b803b10 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 22 Aug 2020 08:23:29 +1000 Subject: net: Get rid of consume_skb when tracing is off The function consume_skb is only meaningful when tracing is enabled. This patch makes it conditional on CONFIG_TRACEPOINTS. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ net/core/skbuff.c | 2 ++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46881d902124..e8bca74857a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1056,7 +1056,16 @@ void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); + +#ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); +#else +static inline void consume_skb(struct sk_buff *skb) +{ + return kfree_skb(skb); +} +#endif + void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_head_cache; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e18184ffa9c3..6faf73d6a0f7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -820,6 +820,7 @@ void skb_tx_error(struct sk_buff *skb) } EXPORT_SYMBOL(skb_tx_error); +#ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free @@ -837,6 +838,7 @@ void consume_skb(struct sk_buff *skb) __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); +#endif /** * consume_stateless_skb - free an skbuff, assuming it is stateless -- cgit v1.2.3 From aac544c3553d98ebe150dda19a25aa253f7ad3fe Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Tue, 25 Aug 2020 01:25:11 +0200 Subject: Compiler Attributes: remove comment about sparse not supporting __has_attribute Sparse supports __has_attribute() since 2018-08-31, so the comment is not true anymore but more importantly is rather confusing. So remove it. Signed-off-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 6122efdad6ad..af7a58c19e20 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -24,12 +24,6 @@ * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17. * In the meantime, to support 4.6 <= gcc < 5, we implement __has_attribute * by hand. - * - * sparse does not support __has_attribute (yet) and defines __GNUC_MINOR__ - * depending on the compiler used to build it; however, these attributes have - * no semantic effects for sparse, so it does not matter. Also note that, - * in order to avoid sparse's warnings, even the unsupported ones must be - * defined to 0. */ #ifndef __has_attribute # define __has_attribute(x) __GCC4_has_attribute_##x -- cgit v1.2.3 From 5861af92ff2a2e002449191413c35f3ec5f721fe Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Tue, 25 Aug 2020 01:25:26 +0200 Subject: Compiler Attributes: fix comment concerning GCC 4.6 GCC 4.6 is not supported anymore, so remove a reference to it, leaving just the part about version prior GCC 5. Signed-off-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index af7a58c19e20..ea7b756b1c8f 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -22,7 +22,7 @@ /* * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17. - * In the meantime, to support 4.6 <= gcc < 5, we implement __has_attribute + * In the meantime, to support gcc < 5, we implement __has_attribute * by hand. */ #ifndef __has_attribute -- cgit v1.2.3 From 645f08975f49441b3e753d8dc5b740cbcb226594 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Aug 2020 07:27:49 -0400 Subject: net: Fix some comments Fix some comments, including wrong function name, duplicated word and so on. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- include/uapi/linux/in.h | 2 +- net/core/sock.c | 2 +- net/ipv4/raw.c | 2 +- net/l3mdev/l3mdev.c | 2 +- net/socket.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e8bca74857a3..8d9ab50b08c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -71,7 +71,7 @@ * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or - * IPv4|UDP where the Next Header field in the IPv6 + * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with @@ -2667,7 +2667,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. - * get_rps_cpus() for example only access one 64 bytes aligned block : + * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 3d0d8231dc19..7d6687618d80 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -135,7 +135,7 @@ struct in_addr { * this socket to prevent accepting spoofed ones. */ #define IP_PMTUDISC_INTERFACE 4 -/* weaker version of IP_PMTUDISC_INTERFACE, which allos packets to get +/* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get * fragmented if they exeed the interface mtu */ #define IP_PMTUDISC_OMIT 5 diff --git a/net/core/sock.c b/net/core/sock.c index e4f40b175acb..8eb2c924805a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3254,7 +3254,7 @@ void sk_common_release(struct sock *sk) sk->sk_prot->destroy(sk); /* - * Observation: when sock_common_release is called, processes have + * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6fd4330287c2..407956be7deb 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) { ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { - /* oif is set, packet is to local broadcast and + /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index e71ca5aec684..864326f150e2 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -154,7 +154,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); /** - * l3mdev_fib_table - get FIB table id associated with an L3 + * l3mdev_fib_table_rcu - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ diff --git a/net/socket.c b/net/socket.c index dbbe8ea7d395..0c0144604f81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3610,7 +3610,7 @@ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) EXPORT_SYMBOL(kernel_getsockname); /** - * kernel_peername - get the address which the socket is connected (kernel space) + * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * @@ -3671,7 +3671,7 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, EXPORT_SYMBOL(kernel_sendpage_locked); /** - * kernel_shutdown - shut down part of a full-duplex connection (kernel space) + * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * -- cgit v1.2.3 From ee921183557af39c1a0475f982d43b0fcac25e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 23 Aug 2020 13:55:36 +0200 Subject: netfilter: nfnetlink: nfnetlink_unicast() reports EAGAIN instead of ENOBUFS Frontend callback reports EAGAIN to nfnetlink to retry a command, this is used to signal that module autoloading is required. Unfortunately, nlmsg_unicast() reports EAGAIN in case the receiver socket buffer gets full, so it enters a busy-loop. This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and to use nlmsg_unicast(). Remove the flags field in nfnetlink_unicast() since this is always MSG_DONTWAIT in the existing code which is exactly what nlmsg_unicast() passes to netlink_unicast() as parameter. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 3 +- net/netfilter/nf_tables_api.c | 61 ++++++++++++++++++------------------- net/netfilter/nfnetlink.c | 11 +++++-- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 5 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 851425c3178f..89016d08f6a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -43,8 +43,7 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags); +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 71e501c5ad21..b7dc1cbf40ea 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -815,11 +815,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) - goto err; + goto err_fill_table_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_table_info: kfree_skb(skb2); return err; } @@ -1563,11 +1563,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) - goto err; + goto err_fill_chain_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_chain_info: kfree_skb(skb2); return err; } @@ -3008,11 +3008,11 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, NULL); if (err < 0) - goto err; + goto err_fill_rule_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_rule_info: kfree_skb(skb2); return err; } @@ -3968,11 +3968,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) - goto err; + goto err_fill_set_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_set_info: kfree_skb(skb2); return err; } @@ -4860,24 +4860,18 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) - goto err1; + return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, &elem); if (err < 0) - goto err2; + goto err_fill_setelem; - err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); - /* This avoids a loop in nfnetlink. */ - if (err < 0) - goto err1; + return nfnetlink_unicast(skb, ctx->net, ctx->portid); - return 0; -err2: +err_fill_setelem: kfree_skb(skb); -err1: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return err; } /* called with rcu_read_lock held */ @@ -6182,10 +6176,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) - goto err; + goto err_fill_obj_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_obj_info: kfree_skb(skb2); return err; } @@ -7045,10 +7040,11 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, &flowtable->hook_list); if (err < 0) - goto err; + goto err_fill_flowtable_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_flowtable_info: kfree_skb(skb2); return err; } @@ -7234,10 +7230,11 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) - goto err; + goto err_fill_gen_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_gen_info: kfree_skb(skb2); return err; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 5f24edf95830..3a2e64e13b22 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -149,10 +149,15 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { - return netlink_unicast(net->nfnl, skb, portid, flags); + int err; + + err = nlmsg_unicast(net->nfnl, skb, portid); + if (err == -EAGAIN) + err = -ENOBUFS; + + return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f02992419850..b35e8d9a5b37 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -356,8 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst) goto out; } } - nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, - MSG_DONTWAIT); + nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index dadfc06245a3..d1d8bca03b4f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -681,7 +681,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; -- cgit v1.2.3 From e5fc436f06eef54ef512ea55a9db8eb9f2e76959 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Fri, 28 Aug 2020 10:53:01 +0200 Subject: sparse: use static inline for __chk_{user,io}_ptr() __chk_user_ptr() & __chk_io_ptr() are dummy extern functions which only exist to enforce the typechecking of __user or __iomem pointers in macros when using sparse. This typechecking is done by inserting a call to these functions. But the presence of these calls can inhibit some simplifications and so influence the result of sparse's analysis of context/locking. Fix this by changing these calls into static inline calls with an empty body. Signed-off-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4b33cb385f96..6e390d58a9f8 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,8 +11,8 @@ # define __iomem __attribute__((noderef, address_space(__iomem))) # define __percpu __attribute__((noderef, address_space(__percpu))) # define __rcu __attribute__((noderef, address_space(__rcu))) -extern void __chk_user_ptr(const volatile void __user *); -extern void __chk_io_ptr(const volatile void __iomem *); +static inline void __chk_user_ptr(const volatile void __user *ptr) { } +static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } /* context/locking */ # define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) -- cgit v1.2.3 From 35556bed836f8dc07ac55f69c8d17dce3e7f0e25 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Sep 2020 10:52:33 +0100 Subject: HID: core: Sanitize event code and type when mapping input When calling into hid_map_usage(), the passed event code is blindly stored as is, even if it doesn't fit in the associated bitmap. This event code can come from a variety of sources, including devices masquerading as input devices, only a bit more "programmable". Instead of taking the event code at face value, check that it actually fits the corresponding bitmap, and if it doesn't: - spit out a warning so that we know which device is acting up - NULLify the bitmap pointer so that we catch unexpected uses Code paths that can make use of untrusted inputs can now check that the mapping was indeed correct and bail out if not. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-input.c | 4 ++++ drivers/hid/hid-multitouch.c | 2 ++ include/linux/hid.h | 42 +++++++++++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index b8eabf206e74..88e19996427e 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1132,6 +1132,10 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel } mapped: + /* Mapping failed, bail out */ + if (!bit) + return; + if (device->driver->input_mapped && device->driver->input_mapped(device, hidinput, field, usage, &bit, &max) < 0) { diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 3f94b4954225..e3152155c4b8 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -856,6 +856,8 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, code = BTN_0 + ((usage->hid - 1) & HID_USAGE); hid_map_usage(hi, usage, bit, max, EV_KEY, code); + if (!*bit) + return -1; input_set_capability(hi->input, EV_KEY, code); return 1; diff --git a/include/linux/hid.h b/include/linux/hid.h index 875f71132b14..c7044a14200e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -959,34 +959,49 @@ static inline void hid_device_io_stop(struct hid_device *hid) { * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type + * + * The value pointed to by @bit will be set to NULL if either @type is + * an unhandled event type, or if @c is out of range for @type. This + * can be used as an error condition. */ static inline void hid_map_usage(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, - __u8 type, __u16 c) + __u8 type, unsigned int c) { struct input_dev *input = hidinput->input; - - usage->type = type; - usage->code = c; + unsigned long *bmap = NULL; + unsigned int limit = 0; switch (type) { case EV_ABS: - *bit = input->absbit; - *max = ABS_MAX; + bmap = input->absbit; + limit = ABS_MAX; break; case EV_REL: - *bit = input->relbit; - *max = REL_MAX; + bmap = input->relbit; + limit = REL_MAX; break; case EV_KEY: - *bit = input->keybit; - *max = KEY_MAX; + bmap = input->keybit; + limit = KEY_MAX; break; case EV_LED: - *bit = input->ledbit; - *max = LED_MAX; + bmap = input->ledbit; + limit = LED_MAX; break; } + + if (unlikely(c > limit || !bmap)) { + pr_warn_ratelimited("%s: Invalid code %d type %d\n", + input->name, c, type); + *bit = NULL; + return; + } + + usage->type = type; + usage->code = c; + *max = limit; + *bit = bmap; } /** @@ -1000,7 +1015,8 @@ static inline void hid_map_usage_clear(struct hid_input *hidinput, __u8 type, __u16 c) { hid_map_usage(hidinput, usage, bit, max, type, c); - clear_bit(c, *bit); + if (*bit) + clear_bit(usage->code, *bit); } /** -- cgit v1.2.3 From 3b5455636fe26ea21b4189d135a424a6da016418 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Sep 2020 12:32:45 -0400 Subject: libata: implement ATA_HORKAGE_MAX_TRIM_128M and apply to Sandisks All three generations of Sandisk SSDs lock up hard intermittently. Experiments showed that disabling NCQ lowered the failure rate significantly and the kernel has been disabling NCQ for some models of SD7's and 8's, which is obviously undesirable. Karthik worked with Sandisk to root cause the hard lockups to trim commands larger than 128M. This patch implements ATA_HORKAGE_MAX_TRIM_128M which limits max trim size to 128M and applies it to all three generations of Sandisk SSDs. Signed-off-by: Tejun Heo Cc: Karthik Shivaram Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 5 ++--- drivers/ata/libata-scsi.c | 8 +++++++- include/linux/libata.h | 1 + 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b1cd4d97bc2a..1be73d29119a 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3868,9 +3868,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* https://bugzilla.kernel.org/show_bug.cgi?id=15573 */ { "C300-CTFDDAC128MAG", "0001", ATA_HORKAGE_NONCQ, }, - /* Some Sandisk SSDs lock up hard with NCQ enabled. Reported on - SD7SN6S256G and SD8SN8U256G */ - { "SanDisk SD[78]SN*G", NULL, ATA_HORKAGE_NONCQ, }, + /* Sandisk SD7/8/9s lock up hard on large trims */ + { "SanDisk SD[789]*", NULL, ATA_HORKAGE_MAX_TRIM_128M, }, /* devices which puke on READ_NATIVE_MAX */ { "HDS724040KLSA80", "KFAOA20N", ATA_HORKAGE_BROKEN_HPA, }, diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index ec233208585b..c7b5049b42d1 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2080,6 +2080,7 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf) static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf) { + struct ata_device *dev = args->dev; u16 min_io_sectors; rbuf[1] = 0xb0; @@ -2105,7 +2106,12 @@ static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf) * with the unmap bit set. */ if (ata_id_has_trim(args->id)) { - put_unaligned_be64(65535 * ATA_MAX_TRIM_RNUM, &rbuf[36]); + u64 max_blocks = 65535 * ATA_MAX_TRIM_RNUM; + + if (dev->horkage & ATA_HORKAGE_MAX_TRIM_128M) + max_blocks = 128 << (20 - SECTOR_SHIFT); + + put_unaligned_be64(max_blocks, &rbuf[36]); put_unaligned_be32(1, &rbuf[28]); } diff --git a/include/linux/libata.h b/include/linux/libata.h index 77ccf040a128..5f550eb27f81 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -421,6 +421,7 @@ enum { ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ + ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 7e24969022cbd61ddc586f14824fc205661bb124 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 18:00:55 +0800 Subject: block: allow for_each_bvec to support zero len bvec Block layer usually doesn't support or allow zero-length bvec. Since commit 1bdc76aea115 ("iov_iter: use bvec iterator to implement iterate_bvec()"), iterate_bvec() switches to bvec iterator. However, Al mentioned that 'Zero-length segments are not disallowed' in iov_iter. Fixes for_each_bvec() so that it can move on after seeing one zero length bvec. Fixes: 1bdc76aea115 ("iov_iter: use bvec iterator to implement iterate_bvec()") Reported-by: syzbot Signed-off-by: Ming Lei Tested-by: Tetsuo Handa Cc: Al Viro Cc: Matthew Wilcox Cc: Link: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2262077.html Signed-off-by: Jens Axboe --- include/linux/bvec.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ac0c7299d5b8..dd74503f7e5e 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -117,11 +117,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, return true; } +static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) +{ + iter->bi_bvec_done = 0; + iter->bi_idx++; +} + #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ - bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) + (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter), \ + (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -- cgit v1.2.3 From 4533d3aed857c558d6aabd00d0cb04100c5a2258 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 1 Sep 2020 10:33:25 +0200 Subject: memremap: rename MEMORY_DEVICE_DEVDAX to MEMORY_DEVICE_GENERIC This is in preparation for the logic behind MEMORY_DEVICE_DEVDAX also being used by non DAX devices. No functional change intended. Signed-off-by: Roger Pau Monné Reviewed-by: Ira Weiny Acked-by: Andrew Morton Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20200901083326.21264-3-roger.pau@citrix.com Signed-off-by: Juergen Gross --- drivers/dax/device.c | 2 +- include/linux/memremap.h | 9 ++++----- mm/memremap.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 4c0af2eb7e19..1e89513f3c59 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -429,7 +429,7 @@ int dev_dax_probe(struct device *dev) return -EBUSY; } - dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; + dev_dax->pgmap.type = MEMORY_DEVICE_GENERIC; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5f5b2df06e61..e5862746751b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -46,11 +46,10 @@ struct vmem_altmap { * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * - * MEMORY_DEVICE_DEVDAX: + * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA - * coherent and supports page pinning. In contrast to - * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax - * character device. + * coherent and supports page pinning. This is for example used by DAX devices + * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer @@ -60,7 +59,7 @@ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_FS_DAX, - MEMORY_DEVICE_DEVDAX, + MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; diff --git a/mm/memremap.c b/mm/memremap.c index 03e38b7a38f1..006dace60b1a 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -216,7 +216,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; - case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_GENERIC: need_devmap_managed = false; break; case MEMORY_DEVICE_PCI_P2PDMA: -- cgit v1.2.3 From 4facb95b7adaf77e2da73aafb9ba60996fe42a12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Sep 2020 01:50:54 +0200 Subject: x86/entry: Unbreak 32bit fast syscall Andy reported that the syscall treacing for 32bit fast syscall fails: # ./tools/testing/selftests/x86/ptrace_syscall_32 ... [RUN] SYSEMU [FAIL] Initial args are wrong (nr=224, args=10 11 12 13 14 4289172732) ... [RUN] SYSCALL [FAIL] Initial args are wrong (nr=29, args=0 0 0 0 0 4289172732) The eason is that the conversion to generic entry code moved the retrieval of the sixth argument (EBP) after the point where the syscall entry work runs, i.e. ptrace, seccomp, audit... Unbreak it by providing a split up version of syscall_enter_from_user_mode(). - syscall_enter_from_user_mode_prepare() establishes state and enables interrupts - syscall_enter_from_user_mode_work() runs the entry work Replace the call to syscall_enter_from_user_mode() in the 32bit fast syscall C-entry with the split functions and stick the EBP retrieval between them. Fixes: 27d6b4d14f5c ("x86/entry: Use generic syscall entry function") Reported-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87k0xdjbtt.fsf@nanos.tec.linutronix.de --- arch/x86/entry/common.c | 29 +++++++++++++++++-------- include/linux/entry-common.h | 51 ++++++++++++++++++++++++++++++++++++-------- kernel/entry/common.c | 35 ++++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 48512c7944e7..2f84c7ca74ea 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { - unsigned int nr = (unsigned int)regs->orig_ax; - if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. - */ - return (unsigned int)syscall_enter_from_user_mode(regs, nr); + + return (unsigned int)regs->orig_ax; } /* @@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { unsigned int nr = syscall_32_enter(regs); + /* + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. + */ + nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); } static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + unsigned int nr = syscall_32_enter(regs); int res; + /* + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. + */ + syscall_enter_from_user_mode_prepare(regs); + instrumentation_begin(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { @@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return false; } + /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ + nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index efebbffcd5cc..159c7476b11b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -110,15 +110,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs #endif /** - * syscall_enter_from_user_mode - Check and handle work before invoking - * a syscall + * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts * @regs: Pointer to currents pt_regs - * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct and the subsequent functions can be - * instrumented. + * function returns all state is correct, interrupts are enabled and the + * subsequent functions can be instrumented. + * + * This handles lockdep, RCU (context tracking) and tracing state. + * + * This is invoked when there is extra architecture specific functionality + * to be done between establishing state and handling user mode entry work. + */ +void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); + +/** + * syscall_enter_from_user_mode_work - Check and handle work before invoking + * a syscall + * @regs: Pointer to currents pt_regs + * @syscall: The syscall number + * + * Invoked from architecture specific syscall entry code with interrupts + * enabled after invoking syscall_enter_from_user_mode_prepare() and extra + * architecture specific work. * * Returns: The original or a modified syscall number * @@ -127,12 +142,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs * syscall_set_return_value() first. If neither of those are called and -1 * is returned, then the syscall will fail with ENOSYS. * - * The following functionality is handled here: + * It handles the following work items: * - * 1) Establish state (lockdep, RCU (context tracking), tracing) - * 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(), + * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(), * __secure_computing(), trace_sys_enter() - * 3) Invocation of audit_syscall_entry() + * 2) Invocation of audit_syscall_entry() + */ +long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); + +/** + * syscall_enter_from_user_mode - Establish state and check and handle work + * before invoking a syscall + * @regs: Pointer to currents pt_regs + * @syscall: The syscall number + * + * Invoked from architecture specific syscall entry code with interrupts + * disabled. The calling code has to be non-instrumentable. When the + * function returns all state is correct, interrupts are enabled and the + * subsequent functions can be instrumented. + * + * This is combination of syscall_enter_from_user_mode_prepare() and + * syscall_enter_from_user_mode_work(). + * + * Returns: The original or a modified syscall number. See + * syscall_enter_from_user_mode_work() for further explanation. */ long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fcae019158ca..18683598edbc 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -69,22 +69,45 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall_get_nr(current, regs); } -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +static __always_inline long +__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long ti_work; - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); ti_work = READ_ONCE(current_thread_info()->flags); if (ti_work & SYSCALL_ENTER_WORK) syscall = syscall_trace_enter(regs, syscall, ti_work); - instrumentation_end(); return syscall; } +long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + return __syscall_enter_from_user_work(regs, syscall); +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = __syscall_enter_from_user_work(regs, syscall); + instrumentation_end(); + + return ret; +} + +noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + instrumentation_begin(); + local_irq_enable(); + instrumentation_end(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * -- cgit v1.2.3 From a2d375eda771a6a4866f3717a8ed81b63acb1dbd Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Mon, 31 Aug 2020 12:22:09 -0600 Subject: dyndbg: refine export, rename to dynamic_debug_exec_queries() commit 4c0d77828d4f ("dyndbg: export ddebug_exec_queries") had a few problems: - broken non DYNAMIC_DEBUG_CORE configs, sparse warning - the exported function modifies query string, breaks on RO strings. - func name follows internal convention, shouldn't be exposed as is. 1st is fixed in header with ifdefd function prototype or stub defn. Also remove an obsolete HAVE-symbol ifdef-comment, and add others. Fix others by wrapping existing internal function with a new one, named in accordance with module-prefix naming convention, before export hits v5.9.0. In new function, copy query string to a local buffer, so users can pass hard-coded/RO queries, and internal function can be used unchanged. Fixes: 4c0d77828d4f ("dyndbg: export ddebug_exec_queries") Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20200831182210.850852-3-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 20 ++++++++++++++++---- lib/dynamic_debug.c | 27 +++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index aa9ff9e1c0b3..8aa0c7c2608c 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -49,6 +49,10 @@ struct _ddebug { #if defined(CONFIG_DYNAMIC_DEBUG_CORE) + +/* exported for module authors to exercise >control */ +int dynamic_debug_exec_queries(const char *query, const char *modname); + int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); @@ -105,7 +109,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, static_branch_unlikely(&descriptor.key.dd_key_false) #endif -#else /* !HAVE_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL */ #define _DPRINTK_KEY_INIT @@ -117,7 +121,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) #endif -#endif +#endif /* CONFIG_JUMP_LABEL */ #define __dynamic_func_call(id, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \ @@ -172,10 +176,11 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) -#else +#else /* !CONFIG_DYNAMIC_DEBUG_CORE */ #include #include +#include static inline int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname) @@ -210,6 +215,13 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii); \ } while (0) -#endif + +static inline int dynamic_debug_exec_queries(const char *query, const char *modname) +{ + pr_warn("kernel not built with CONFIG_DYNAMIC_DEBUG_CORE\n"); + return 0; +} + +#endif /* !CONFIG_DYNAMIC_DEBUG_CORE */ #endif diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 01b7d0210412..08e4b057514c 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -525,7 +525,7 @@ static int ddebug_exec_query(char *query_string, const char *modname) last error or number of matching callsites. Module name is either in param (for boot arg) or perhaps in query string. */ -int ddebug_exec_queries(char *query, const char *modname) +static int ddebug_exec_queries(char *query, const char *modname) { char *split; int i, errs = 0, exitcode = 0, rc, nfound = 0; @@ -557,7 +557,30 @@ int ddebug_exec_queries(char *query, const char *modname) return exitcode; return nfound; } -EXPORT_SYMBOL_GPL(ddebug_exec_queries); + +/** + * dynamic_debug_exec_queries - select and change dynamic-debug prints + * @query: query-string described in admin-guide/dynamic-debug-howto + * @modname: string containing module name, usually &module.mod_name + * + * This uses the >/proc/dynamic_debug/control reader, allowing module + * authors to modify their dynamic-debug callsites. The modname is + * canonically struct module.mod_name, but can also be null or a + * module-wildcard, for example: "drm*". + */ +int dynamic_debug_exec_queries(const char *query, const char *modname) +{ + int rc; + char *qry = kstrndup(query, PAGE_SIZE, GFP_KERNEL); + + if (!query) + return -ENOMEM; + + rc = ddebug_exec_queries(qry, modname); + kfree(qry); + return rc; +} +EXPORT_SYMBOL_GPL(dynamic_debug_exec_queries); #define PREFIX_SIZE 64 -- cgit v1.2.3 From 1a0cf26323c80e2f1c58fc04f15686de61bfab0c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:56 -0400 Subject: mm/ksm: Remove reuse_ksm_page() Remove the function as the last reference has gone away with the do_wp_page() changes. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 7 ------- mm/ksm.c | 25 ------------------------- 2 files changed, 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e48b1e453ff5..161e8164abcf 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -53,8 +53,6 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -88,11 +86,6 @@ static inline void rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } -static inline bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address) -{ - return false; -} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index 4102034cd55a..12958287fac3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2660,31 +2660,6 @@ again: goto again; } -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ -#ifdef CONFIG_DEBUG_VM - if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || - WARN_ON(!page_mapped(page)) || - WARN_ON(!PageLocked(page))) { - dump_page(page, "reuse_ksm_page"); - return false; - } -#endif - - if (PageSwapCache(page) || !page_stable_node(page)) - return false; - /* Prohibit parallel get_ksm_page() */ - if (!page_ref_freeze(page, 1)) - return false; - - page_move_anon_rmap(page, vma); - page->index = linear_page_index(vma, address); - page_ref_unfreeze(page, 1); - - return true; -} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { -- cgit v1.2.3 From 798a6b87ecd72828a6c6b5469aaa2032a57e92b7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:58 -0400 Subject: mm: Add PGREUSE counter This accounts for wp_page_reuse() case, where we reused a page for COW. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/memory.c | 1 + mm/vmstat.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 24fc7c3ae7d6..58d3f91baad1 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, + PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSCAN_KSWAPD, diff --git a/mm/memory.c b/mm/memory.c index 56ae945c6845..20d93001ef93 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2619,6 +2619,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); + count_vm_event(PGREUSE); } /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 3fb23a21f6dd..907a5045bfa3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1198,6 +1198,7 @@ const char * const vmstat_text[] = { "pglazyfreed", "pgrefill", + "pgreuse", "pgsteal_kswapd", "pgsteal_direct", "pgscan_kswapd", -- cgit v1.2.3 From 428fc0aff4e59399ec719ffcc1f7a5d29a4ee476 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Sep 2020 16:36:19 -0700 Subject: include/linux/log2.h: add missing () around n in roundup_pow_of_two() Otherwise gcc generates warnings if the expression is complicated. Fixes: 312a0c170945 ("[PATCH] LOG2: Alter roundup_pow_of_two() so that it can use a ilog2() on a constant") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/0-v1-8a2697e3c003+41165-log_brackets_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/log2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index 83a4a3ca3e8a..c619ec6eff4a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -173,7 +173,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) #define roundup_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 1) ? 1 : \ + ((n) == 1) ? 1 : \ (1UL << (ilog2((n) - 1) + 1)) \ ) : \ __roundup_pow_of_two(n) \ -- cgit v1.2.3 From 1c30474826682bc970c3200700d8bcfa2b771278 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:42:52 -0700 Subject: PM: : fix @em_pd kernel-doc warning Fix kernel-doc warning in : ../include/linux/device.h:613: warning: Function parameter or member 'em_pd' not described in 'device' Fixes: 1bc138c62295 ("PM / EM: add support for other devices than CPUs in Energy Model") Signed-off-by: Randy Dunlap Cc: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/d97f40ad-3033-703a-c3cb-2843ce0f6371@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index ca18da4768e3..9e6ea8931a52 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -454,6 +454,7 @@ struct dev_links_info { * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. + * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors -- cgit v1.2.3 From eb02d39ad3099c3dcd96c5b3fdc0303541766ed1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:31:16 -0700 Subject: netdevice.h: fix proto_down_reason kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'proto_down_reason' not described in 'net_device' Fixes: 829eb208e80d ("rtnetlink: add support for protodown reason") Signed-off-by: Randy Dunlap Acked-by: Roopa Prabhu Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..bf0e313486be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,6 +1784,7 @@ enum netdev_priv_flags { * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * + * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one -- cgit v1.2.3 From ffa59b0b396cb2c0ea6712ff30ee05c35d4c9c8d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:32:30 -0700 Subject: netdevice.h: fix xdp_state kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'xdp_state' not described in 'net_device' Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Randy Dunlap Cc: Andrii Nakryiko Cc: Alexei Starovoitov Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf0e313486be..7bd4fcdd0738 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1849,6 +1849,7 @@ enum netdev_priv_flags { * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state + * @xdp_state: stores info on attached XDP BPF programs * * FIXME: cleanup struct net_device such that network protocol info * moves out. -- cgit v1.2.3 From 0a355aeb24081e4538d4d424cd189f16c0bbd983 Mon Sep 17 00:00:00 2001 From: Evan Nimmo Date: Wed, 9 Sep 2020 08:32:47 +1200 Subject: i2c: algo: pca: Reapply i2c bus settings after reset If something goes wrong (such as the SCL being stuck low) then we need to reset the PCA chip. The issue with this is that on reset we lose all config settings and the chip ends up in a disabled state which results in a lock up/high CPU usage. We need to re-apply any configuration that had previously been set and re-enable the chip. Signed-off-by: Evan Nimmo Reviewed-by: Chris Packham Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/algos/i2c-algo-pca.c | 35 +++++++++++++++++++++++------------ include/linux/i2c-algo-pca.h | 15 +++++++++++++++ 2 files changed, 38 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index 710fbef9a9c2..384af88e58ad 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -41,8 +41,22 @@ static void pca_reset(struct i2c_algo_pca_data *adap) pca_outw(adap, I2C_PCA_INDPTR, I2C_PCA_IPRESET); pca_outw(adap, I2C_PCA_IND, 0xA5); pca_outw(adap, I2C_PCA_IND, 0x5A); + + /* + * After a reset we need to re-apply any configuration + * (calculated in pca_init) to get the bus in a working state. + */ + pca_outw(adap, I2C_PCA_INDPTR, I2C_PCA_IMODE); + pca_outw(adap, I2C_PCA_IND, adap->bus_settings.mode); + pca_outw(adap, I2C_PCA_INDPTR, I2C_PCA_ISCLL); + pca_outw(adap, I2C_PCA_IND, adap->bus_settings.tlow); + pca_outw(adap, I2C_PCA_INDPTR, I2C_PCA_ISCLH); + pca_outw(adap, I2C_PCA_IND, adap->bus_settings.thi); + + pca_set_con(adap, I2C_PCA_CON_ENSIO); } else { adap->reset_chip(adap->data); + pca_set_con(adap, I2C_PCA_CON_ENSIO | adap->bus_settings.clock_freq); } } @@ -423,13 +437,14 @@ static int pca_init(struct i2c_adapter *adap) " Use the nominal frequency.\n", adap->name); } - pca_reset(pca_data); - clock = pca_clock(pca_data); printk(KERN_INFO "%s: Clock frequency is %dkHz\n", adap->name, freqs[clock]); - pca_set_con(pca_data, I2C_PCA_CON_ENSIO | clock); + /* Store settings as these will be needed when the PCA chip is reset */ + pca_data->bus_settings.clock_freq = clock; + + pca_reset(pca_data); } else { int clock; int mode; @@ -496,19 +511,15 @@ static int pca_init(struct i2c_adapter *adap) thi = tlow * min_thi / min_tlow; } + /* Store settings as these will be needed when the PCA chip is reset */ + pca_data->bus_settings.mode = mode; + pca_data->bus_settings.tlow = tlow; + pca_data->bus_settings.thi = thi; + pca_reset(pca_data); printk(KERN_INFO "%s: Clock frequency is %dHz\n", adap->name, clock * 100); - - pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_IMODE); - pca_outw(pca_data, I2C_PCA_IND, mode); - pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_ISCLL); - pca_outw(pca_data, I2C_PCA_IND, tlow); - pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_ISCLH); - pca_outw(pca_data, I2C_PCA_IND, thi); - - pca_set_con(pca_data, I2C_PCA_CON_ENSIO); } udelay(500); /* 500 us for oscillator to stabilise */ diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h index d03071732db4..7c522fdd9ea7 100644 --- a/include/linux/i2c-algo-pca.h +++ b/include/linux/i2c-algo-pca.h @@ -53,6 +53,20 @@ #define I2C_PCA_CON_SI 0x08 /* Serial Interrupt */ #define I2C_PCA_CON_CR 0x07 /* Clock Rate (MASK) */ +/** + * struct pca_i2c_bus_settings - The configured PCA i2c bus settings + * @mode: Configured i2c bus mode + * @tlow: Configured SCL LOW period + * @thi: Configured SCL HIGH period + * @clock_freq: The configured clock frequency + */ +struct pca_i2c_bus_settings { + int mode; + int tlow; + int thi; + int clock_freq; +}; + struct i2c_algo_pca_data { void *data; /* private low level data */ void (*write_byte) (void *data, int reg, int val); @@ -64,6 +78,7 @@ struct i2c_algo_pca_data { * For PCA9665, use the frequency you want here. */ unsigned int i2c_clock; unsigned int chip; + struct pca_i2c_bus_settings bus_settings; }; int i2c_pca_add_bus(struct i2c_adapter *); -- cgit v1.2.3 From 4a009cb04aeca0de60b73f37b102573354214b52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Sep 2020 01:27:40 -0700 Subject: net: add __must_check to skb_put_padto() skb_put_padto() and __skb_put_padto() callers must check return values or risk use-after-free. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ed9bea924dc3..04a18e01b362 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3223,8 +3223,9 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ -static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, - bool free_on_error) +static inline int __must_check __skb_put_padto(struct sk_buff *skb, + unsigned int len, + bool free_on_error) { unsigned int size = skb->len; @@ -3247,7 +3248,7 @@ static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } -- cgit v1.2.3 From 2d2fe8433796603091ac8ea235b9165ac5a85f9a Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 9 Sep 2020 20:43:08 +0300 Subject: net: qed: Disable aRFS for NPAR and 100G In CMT and NPAR the PF is unknown when the GFS block processes the packet. Therefore cannot use searcher as it has a per PF database, and thus ARFS must be disabled. Fixes: d51e4af5c209 ("qed: aRFS infrastructure support") Signed-off-by: Manish Chopra Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 11 ++++++++++- drivers/net/ethernet/qlogic/qed/qed_l2.c | 3 +++ drivers/net/ethernet/qlogic/qed/qed_main.c | 2 ++ include/linux/qed/qed_if.h | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index b8f076e4e6b8..3db181f3617a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -4253,7 +4253,8 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) | BIT(QED_MF_LLH_PROTO_CLSS) | BIT(QED_MF_LL2_NON_UNICAST) | - BIT(QED_MF_INTER_PF_SWITCH); + BIT(QED_MF_INTER_PF_SWITCH) | + BIT(QED_MF_DISABLE_ARFS); break; case NVM_CFG1_GLOB_MF_MODE_DEFAULT: cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) | @@ -4266,6 +4267,14 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n", cdev->mf_bits); + + /* In CMT the PF is unknown when the GFS block processes the + * packet. Therefore cannot use searcher as it has a per PF + * database, and thus ARFS must be disabled. + * + */ + if (QED_IS_CMT(cdev)) + cdev->mf_bits |= BIT(QED_MF_DISABLE_ARFS); } DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 4c6ac8862744..07824bf9d68d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1980,6 +1980,9 @@ void qed_arfs_mode_configure(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_arfs_config_params *p_cfg_params) { + if (test_bit(QED_MF_DISABLE_ARFS, &p_hwfn->cdev->mf_bits)) + return; + if (p_cfg_params->mode != QED_FILTER_CONFIG_MODE_DISABLE) { qed_gft_config(p_hwfn, p_ptt, p_hwfn->rel_pf_id, p_cfg_params->tcp, diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index f39f629242a1..50e5eb22e60a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -444,6 +444,8 @@ int qed_fill_dev_info(struct qed_dev *cdev, dev_info->fw_eng = FW_ENGINEERING_VERSION; dev_info->b_inter_pf_switch = test_bit(QED_MF_INTER_PF_SWITCH, &cdev->mf_bits); + if (!test_bit(QED_MF_DISABLE_ARFS, &cdev->mf_bits)) + dev_info->b_arfs_capable = true; dev_info->tx_switching = true; if (hw_info->b_wol_support == QED_WOL_SUPPORT_PME) diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index cd6a5c7e56eb..cdd73afc4c46 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -623,6 +623,7 @@ struct qed_dev_info { #define QED_MFW_VERSION_3_OFFSET 24 u32 flash_size; + bool b_arfs_capable; bool b_inter_pf_switch; bool tx_switching; bool rdma_supported; -- cgit v1.2.3 From baaabecfc80fad255f866563b53b8c7a3eec176e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 9 Sep 2020 15:53:54 -0700 Subject: test_firmware: Test platform fw loading on non-EFI systems On non-EFI systems, it wasn't possible to test the platform firmware loader because it will have never set "checked_fw" during __init. Instead, allow the test code to override this check. Additionally split the declarations into a private symbol namespace so there is greater enforcement of the symbol visibility. Fixes: 548193cba2a7 ("test_firmware: add support for firmware_request_platform") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20200909225354.3118328-1-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/embedded-firmware.c | 10 +++++----- include/linux/efi_embedded_fw.h | 6 ++---- lib/test_firmware.c | 9 +++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/embedded-firmware.c b/drivers/firmware/efi/embedded-firmware.c index e97a9c9d010c..21ae0c48232a 100644 --- a/drivers/firmware/efi/embedded-firmware.c +++ b/drivers/firmware/efi/embedded-firmware.c @@ -16,9 +16,9 @@ /* Exported for use by lib/test_firmware.c only */ LIST_HEAD(efi_embedded_fw_list); -EXPORT_SYMBOL_GPL(efi_embedded_fw_list); - -static bool checked_for_fw; +EXPORT_SYMBOL_NS_GPL(efi_embedded_fw_list, TEST_FIRMWARE); +bool efi_embedded_fw_checked; +EXPORT_SYMBOL_NS_GPL(efi_embedded_fw_checked, TEST_FIRMWARE); static const struct dmi_system_id * const embedded_fw_table[] = { #ifdef CONFIG_TOUCHSCREEN_DMI @@ -116,14 +116,14 @@ void __init efi_check_for_embedded_firmwares(void) } } - checked_for_fw = true; + efi_embedded_fw_checked = true; } int efi_get_embedded_fw(const char *name, const u8 **data, size_t *size) { struct efi_embedded_fw *iter, *fw = NULL; - if (!checked_for_fw) { + if (!efi_embedded_fw_checked) { pr_warn("Warning %s called while we did not check for embedded fw\n", __func__); return -ENOENT; diff --git a/include/linux/efi_embedded_fw.h b/include/linux/efi_embedded_fw.h index 57eac5241303..a97a12bb2c9e 100644 --- a/include/linux/efi_embedded_fw.h +++ b/include/linux/efi_embedded_fw.h @@ -8,8 +8,8 @@ #define EFI_EMBEDDED_FW_PREFIX_LEN 8 /* - * This struct and efi_embedded_fw_list are private to the efi-embedded fw - * implementation they are in this header for use by lib/test_firmware.c only! + * This struct is private to the efi-embedded fw implementation. + * They are in this header for use by lib/test_firmware.c only! */ struct efi_embedded_fw { struct list_head list; @@ -18,8 +18,6 @@ struct efi_embedded_fw { size_t length; }; -extern struct list_head efi_embedded_fw_list; - /** * struct efi_embedded_fw_desc - This struct is used by the EFI embedded-fw * code to search for embedded firmwares. diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 9fee2b93a8d1..06c955057756 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -26,6 +26,8 @@ #include #include +MODULE_IMPORT_NS(TEST_FIRMWARE); + #define TEST_FIRMWARE_NAME "test-firmware.bin" #define TEST_FIRMWARE_NUM_REQS 4 #define TEST_FIRMWARE_BUF_SIZE SZ_1K @@ -489,6 +491,9 @@ out: static DEVICE_ATTR_WO(trigger_request); #ifdef CONFIG_EFI_EMBEDDED_FIRMWARE +extern struct list_head efi_embedded_fw_list; +extern bool efi_embedded_fw_checked; + static ssize_t trigger_request_platform_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -501,6 +506,7 @@ static ssize_t trigger_request_platform_store(struct device *dev, }; struct efi_embedded_fw efi_embedded_fw; const struct firmware *firmware = NULL; + bool saved_efi_embedded_fw_checked; char *name; int rc; @@ -513,6 +519,8 @@ static ssize_t trigger_request_platform_store(struct device *dev, efi_embedded_fw.data = (void *)test_data; efi_embedded_fw.length = sizeof(test_data); list_add(&efi_embedded_fw.list, &efi_embedded_fw_list); + saved_efi_embedded_fw_checked = efi_embedded_fw_checked; + efi_embedded_fw_checked = true; pr_info("loading '%s'\n", name); rc = firmware_request_platform(&firmware, name, dev); @@ -530,6 +538,7 @@ static ssize_t trigger_request_platform_store(struct device *dev, rc = count; out: + efi_embedded_fw_checked = saved_efi_embedded_fw_checked; release_firmware(firmware); list_del(&efi_embedded_fw.list); kfree(name); -- cgit v1.2.3 From 95035eac763294eb4543aea9afd48d2f7c8caa5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:42:52 -0700 Subject: PM: : fix @em_pd kernel-doc warning Fix kernel-doc warning in : ../include/linux/device.h:613: warning: Function parameter or member 'em_pd' not described in 'device' Fixes: 1bc138c62295 ("PM / EM: add support for other devices than CPUs in Energy Model") Signed-off-by: Randy Dunlap Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index ca18da4768e3..9e6ea8931a52 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -454,6 +454,7 @@ struct dev_links_info { * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. + * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors -- cgit v1.2.3 From cc88b78c0870ebcab2123ba9e73689d97fbf3b14 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Thu, 10 Sep 2020 15:57:46 +0530 Subject: powercap: make documentation reflect code Fix up the documentation of the struct powercap_control_type members to match the code. Also fixup stray whitespace. Signed-off-by: Amit Kucheria [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/powercap.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/powercap.h b/include/linux/powercap.h index 4537f57f9e42..3d557bbcd2c7 100644 --- a/include/linux/powercap.h +++ b/include/linux/powercap.h @@ -44,19 +44,18 @@ struct powercap_control_type_ops { }; /** - * struct powercap_control_type- Defines a powercap control_type - * @name: name of control_type + * struct powercap_control_type - Defines a powercap control_type * @dev: device for this control_type * @idr: idr to have unique id for its child - * @root_node: Root holding power zones for this control_type + * @nr_zones: counter for number of zones of this type * @ops: Pointer to callback struct - * @node_lock: mutex for control type + * @lock: mutex for control type * @allocated: This is possible that client owns the memory * used by this structure. In this case * this flag is set to false by framework to * prevent deallocation during release process. * Otherwise this flag is set to true. - * @ctrl_inst: link to the control_type list + * @node: linked-list node * * Defines powercap control_type. This acts as a container for power * zones, which use same method to control power. E.g. RAPL, RAPL-PCI etc. @@ -129,7 +128,7 @@ struct powercap_zone_ops { * this flag is set to false by framework to * prevent deallocation during release process. * Otherwise this flag is set to true. - * @constraint_ptr: List of constraints for this zone. + * @constraints: List of constraints for this zone. * * This defines a power zone instance. The fields of this structure are * private, and should not be used by client drivers. -- cgit v1.2.3 From 83896b0bd8223ac33bcc609bcc82a57a587002ff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 10 Sep 2020 04:46:40 -0400 Subject: net: Fix broken NETIF_F_CSUM_MASK spell in netdev_features.h Remove the weird space inside the NETIF_F_CSUM_MASK. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2cc3cf80b49a..0b17c4322b09 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -193,7 +193,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) #define NETIF_F_GSO_MASK (__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \ __NETIF_F_BIT(NETIF_F_GSO_SHIFT)) -/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be +/* List of IP checksum features. Note that NETIF_F_HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ -- cgit v1.2.3 From e0830dbf71f191851ed3772d2760f007b7c5bc3a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Sep 2020 16:31:01 +0200 Subject: serial: core: fix console port-lock regression Fix the port-lock initialisation regression introduced by commit a3cb39d258ef ("serial: core: Allow detach and attach serial device for console") by making sure that the lock is again initialised during console setup. The console may be registered before the serial controller has been probed in which case the port lock needs to be initialised during console setup by a call to uart_set_options(). The console-detach changes introduced a regression in several drivers by effectively removing that initialisation by not initialising the lock when the port is used as a console (which is always the case during console setup). Add back the early lock initialisation and instead use a new console-reinit flag to handle the case where a console is being re-attached through sysfs. The question whether the console-detach interface should have been added in the first place is left for another discussion. Note that the console-enabled check in uart_set_options() is not redundant because of kgdboc, which can end up reinitialising an already enabled console (see commit 42b6a1baa3ec ("serial_core: Don't re-initialize a previously initialized spinlock.")). Fixes: a3cb39d258ef ("serial: core: Allow detach and attach serial device for console") Cc: stable # 5.7 Signed-off-by: Johan Hovold Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200909143101.15389-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 32 +++++++++++++++----------------- include/linux/serial_core.h | 1 + 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 53b79e1fcbc8..124524ecfe26 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1916,24 +1916,12 @@ static inline bool uart_console_enabled(struct uart_port *port) return uart_console(port) && (port->cons->flags & CON_ENABLED); } -static void __uart_port_spin_lock_init(struct uart_port *port) +static void uart_port_spin_lock_init(struct uart_port *port) { spin_lock_init(&port->lock); lockdep_set_class(&port->lock, &port_lock_key); } -/* - * Ensure that the serial console lock is initialised early. - * If this port is a console, then the spinlock is already initialised. - */ -static inline void uart_port_spin_lock_init(struct uart_port *port) -{ - if (uart_console(port)) - return; - - __uart_port_spin_lock_init(port); -} - #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL) /** * uart_console_write - write a console message to a serial port @@ -2086,7 +2074,15 @@ uart_set_options(struct uart_port *port, struct console *co, struct ktermios termios; static struct ktermios dummy; - uart_port_spin_lock_init(port); + /* + * Ensure that the serial-console lock is initialised early. + * + * Note that the console-enabled check is needed because of kgdboc, + * which can end up calling uart_set_options() for an already enabled + * console via tty_find_polling_driver() and uart_poll_init(). + */ + if (!uart_console_enabled(port) && !port->console_reinit) + uart_port_spin_lock_init(port); memset(&termios, 0, sizeof(struct ktermios)); @@ -2794,10 +2790,12 @@ static ssize_t console_store(struct device *dev, if (oldconsole && !newconsole) { ret = unregister_console(uport->cons); } else if (!oldconsole && newconsole) { - if (uart_console(uport)) + if (uart_console(uport)) { + uport->console_reinit = 1; register_console(uport->cons); - else + } else { ret = -ENOENT; + } } } else { ret = -ENXIO; @@ -2898,7 +2896,7 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) * initialised. */ if (!uart_console_enabled(uport)) - __uart_port_spin_lock_init(uport); + uart_port_spin_lock_init(uport); if (uport->cons && uport->dev) of_console_check(uport->dev->of_node, uport->cons->name, uport->line); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 01fc4d9c9c54..8a99279a579b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -248,6 +248,7 @@ struct uart_port { unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; + unsigned char console_reinit; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ -- cgit v1.2.3 From e6b1a44eccfcab5e5e280be376f65478c3b2c7a2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 15 Sep 2020 22:07:50 +0800 Subject: locking/percpu-rwsem: Use this_cpu_{inc,dec}() for read_count The __this_cpu*() accessors are (in general) IRQ-unsafe which, given that percpu-rwsem is a blocking primitive, should be just fine. However, file_end_write() is used from IRQ context and will cause load-store issues on architectures where the per-cpu accessors are not natively irq-safe. Fix it by using the IRQ-safe this_cpu_*() for operations on read_count. This will generate more expensive code on a number of platforms, which might cause a performance regression for some of the other percpu-rwsem users. If any such is reported, we can consider alternative solutions. Fixes: 70fe2f48152e ("aio: fix freeze protection of aio writes") Signed-off-by: Hou Tao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Acked-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20200915140750.137881-1-houtao1@huawei.com --- include/linux/percpu-rwsem.h | 8 ++++---- kernel/locking/percpu-rwsem.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5e033fe1ff4e..5fda40f97fe9 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -60,7 +60,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) * anything we did within this RCU-sched read-size critical section. */ if (likely(rcu_sync_is_idle(&sem->rss))) - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); else __percpu_down_read(sem, false); /* Unconditional memory barrier */ /* @@ -79,7 +79,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); else ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); @@ -103,7 +103,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) { - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); } else { /* * slowpath; reader will only ever wake a single blocked @@ -115,7 +115,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) * aggregate zero, as that is the only time it matters) they * will also see our critical section. */ - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); rcuwait_wake_up(&sem->writer); } preempt_enable(); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..70a32a576f3f 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); -- cgit v1.2.3 From 8747f2022fe8d8029193707ee86ff5c792cbef9b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2020 12:32:00 +0200 Subject: cpuidle: Allow cpuidle drivers to take over RCU-idle Some drivers have to do significant work, some of which relies on RCU still being active. Instead of using RCU_NONIDLE in the drivers and flipping RCU back on, allow drivers to take over RCU-idle duty. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ulf Hansson Tested-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 15 ++++++++++----- include/linux/cpuidle.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 04becd70cc41..6c7e5621cf9a 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -138,6 +138,7 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { ktime_t time_start, time_end; + struct cpuidle_state *target_state = &drv->states[index]; time_start = ns_to_ktime(local_clock()); @@ -153,8 +154,9 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv, * suspended is generally unsafe. */ stop_critical_timings(); - rcu_idle_enter(); - drv->states[index].enter_s2idle(dev, drv, index); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + rcu_idle_enter(); + target_state->enter_s2idle(dev, drv, index); if (WARN_ON_ONCE(!irqs_disabled())) local_irq_disable(); /* @@ -162,7 +164,8 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv, * first CPU executing it calls functions containing RCU read-side * critical sections, so tell RCU about that. */ - rcu_idle_exit(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + rcu_idle_exit(); tick_unfreeze(); start_critical_timings(); @@ -239,9 +242,11 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, time_start = ns_to_ktime(local_clock()); stop_critical_timings(); - rcu_idle_enter(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + rcu_idle_enter(); entered_state = target_state->enter(dev, drv, index); - rcu_idle_exit(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + rcu_idle_exit(); start_critical_timings(); sched_clock_idle_wakeup_event(); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 75895e6363b8..6175c77bf25e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -82,6 +82,7 @@ struct cpuidle_state { #define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ #define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ #define CPUIDLE_FLAG_TLB_FLUSHED BIT(5) /* idle-state flushes TLBs */ +#define CPUIDLE_FLAG_RCU_IDLE BIT(6) /* idle-state takes care of RCU */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; -- cgit v1.2.3 From ffbc3dd1975f1e2dac7b1752aa8b5cac3cd5b459 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 16 Sep 2020 23:18:43 +0300 Subject: fs: fix cast in fsparam_u32hex() macro Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- include/linux/fs_parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 2eab6d5f6736..aab0ffc6bac6 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -120,7 +120,7 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) #define fsparam_u32hex(NAME, OPT) \ - __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *16)) + __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From 75df529bec9110dad43ab30e2d9490242529e8b8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 16 Sep 2020 17:45:30 +0200 Subject: arm64: paravirt: Initialize steal time when cpu is online Steal time initialization requires mapping a memory region which invokes a memory allocation. Doing this at CPU starting time results in the following trace when CONFIG_DEBUG_ATOMIC_SLEEP is enabled: BUG: sleeping function called from invalid context at mm/slab.h:498 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.9.0-rc5+ #1 Call trace: dump_backtrace+0x0/0x208 show_stack+0x1c/0x28 dump_stack+0xc4/0x11c ___might_sleep+0xf8/0x130 __might_sleep+0x58/0x90 slab_pre_alloc_hook.constprop.101+0xd0/0x118 kmem_cache_alloc_node_trace+0x84/0x270 __get_vm_area_node+0x88/0x210 get_vm_area_caller+0x38/0x40 __ioremap_caller+0x70/0xf8 ioremap_cache+0x78/0xb0 memremap+0x9c/0x1a8 init_stolen_time_cpu+0x54/0xf0 cpuhp_invoke_callback+0xa8/0x720 notify_cpu_starting+0xc8/0xd8 secondary_start_kernel+0x114/0x180 CPU1: Booted secondary processor 0x0000000001 [0x431f0a11] However we don't need to initialize steal time at CPU starting time. We can simply wait until CPU online time, just sacrificing a bit of accuracy by returning zero for steal time until we know better. While at it, add __init to the functions that are only called by pv_time_init() which is __init. Signed-off-by: Andrew Jones Fixes: e0685fa228fd ("arm64: Retrieve stolen time as paravirtualized guest") Cc: stable@vger.kernel.org Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200916154530.40809-1-drjones@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/paravirt.c | 26 +++++++++++++++----------- include/linux/cpuhotplug.h | 1 - 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 295d66490584..c07d7a034941 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -50,16 +50,19 @@ static u64 pv_steal_clock(int cpu) struct pv_time_stolen_time_region *reg; reg = per_cpu_ptr(&stolen_time_region, cpu); - if (!reg->kaddr) { - pr_warn_once("stolen time enabled but not configured for cpu %d\n", - cpu); + + /* + * paravirt_steal_clock() may be called before the CPU + * online notification callback runs. Until the callback + * has run we just return zero. + */ + if (!reg->kaddr) return 0; - } return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time)); } -static int stolen_time_dying_cpu(unsigned int cpu) +static int stolen_time_cpu_down_prepare(unsigned int cpu) { struct pv_time_stolen_time_region *reg; @@ -73,7 +76,7 @@ static int stolen_time_dying_cpu(unsigned int cpu) return 0; } -static int init_stolen_time_cpu(unsigned int cpu) +static int stolen_time_cpu_online(unsigned int cpu) { struct pv_time_stolen_time_region *reg; struct arm_smccc_res res; @@ -103,19 +106,20 @@ static int init_stolen_time_cpu(unsigned int cpu) return 0; } -static int pv_time_init_stolen_time(void) +static int __init pv_time_init_stolen_time(void) { int ret; - ret = cpuhp_setup_state(CPUHP_AP_ARM_KVMPV_STARTING, - "hypervisor/arm/pvtime:starting", - init_stolen_time_cpu, stolen_time_dying_cpu); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "hypervisor/arm/pvtime:online", + stolen_time_cpu_online, + stolen_time_cpu_down_prepare); if (ret < 0) return ret; return 0; } -static bool has_pv_steal_clock(void) +static bool __init has_pv_steal_clock(void) { struct arm_smccc_res res; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 3215023d4852..bf9181cef444 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -142,7 +142,6 @@ enum cpuhp_state { /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, - CPUHP_AP_ARM_KVMPV_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, -- cgit v1.2.3 From 5ef64cc8987a9211d3f3667331ba3411a94ddc79 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Sep 2020 14:05:35 -0700 Subject: mm: allow a controlled amount of unfairness in the page lock Commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") made the page locking entirely fair, in that if a waiter came in while the lock was held, the lock would be transferred to the lockers strictly in order. That was intended to finally get rid of the long-reported watchdog failures that involved the page lock under extreme load, where a process could end up waiting essentially forever, as other page lockers stole the lock from under it. It also improved some benchmarks, but it ended up causing huge performance regressions on others, simply because fair lock behavior doesn't end up giving out the lock as aggressively, causing better worst-case latency, but potentially much worse average latencies and throughput. Instead of reverting that change entirely, this introduces a controlled amount of unfairness, with a sysctl knob to tune it if somebody needs to. But the default value should hopefully be good for any normal load, allowing a few rounds of lock stealing, but enforcing the strict ordering before the lock has been stolen too many times. There is also a hint from Matthieu Baerts that the fair page coloring may end up exposing an ABBA deadlock that is hidden by the usual optimistic lock stealing, and while the unfairness doesn't fix the fundamental issue (and I'm still looking at that), it avoids it in practice. The amount of unfairness can be modified by writing a new value to the 'sysctl_page_lock_unfairness' variable (default value of 5, exposed through /proc/sys/vm/page_lock_unfairness), but that is hopefully something we'd use mainly for debugging rather than being necessary for any deep system tuning. This whole issue has exposed just how critical the page lock can be, and how contended it gets under certain locks. And the main contention doesn't really seem to be anything related to IO (which was the origin of this lock), but for things like just verifying that the page file mapping is stable while faulting in the page into a page table. Link: https://lore.kernel.org/linux-fsdevel/ed8442fd-6f54-dd84-cd4a-941e8b7ee603@MichaelLarabel.com/ Link: https://www.phoronix.com/scan.php?page=article&item=linux-50-59&num=1 Link: https://lore.kernel.org/linux-fsdevel/c560a38d-8313-51fb-b1ec-e904bd8836bc@tessares.net/ Reported-and-tested-by: Michael Larabel Tested-by: Matthieu Baerts Cc: Dave Chinner Cc: Matthew Wilcox Cc: Chris Mason Cc: Jan Kara Cc: Amir Goldstein Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 + include/linux/wait.h | 1 + kernel/sysctl.c | 8 +++ mm/filemap.c | 160 +++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 140 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ca6e6a81576b..b2f370f0b420 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -41,6 +41,8 @@ struct writeback_control; struct bdi_writeback; struct pt_regs; +extern int sysctl_page_lock_unfairness; + void init_mm_internals(void); #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 898c890fc153..27fb99cfeb02 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -21,6 +21,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 +#define WQ_FLAG_DONE 0x10 /* * A single wait-queue entry structure: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e70ee2332e..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2912,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", diff --git a/mm/filemap.c b/mm/filemap.c index 1aaea26556cc..6aa08e7714ce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -988,9 +988,43 @@ void __init pagecache_init(void) page_writeback_init(); } +/* + * The page wait code treats the "wait->flags" somewhat unusually, because + * we have multiple different kinds of waits, not just he usual "exclusive" + * one. + * + * We have: + * + * (a) no special bits set: + * + * We're just waiting for the bit to be released, and when a waker + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, + * and remove it from the wait queue. + * + * Simple and straightforward. + * + * (b) WQ_FLAG_EXCLUSIVE: + * + * The waiter is waiting to get the lock, and only one waiter should + * be woken up to avoid any thundering herd behavior. We'll set the + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. + * + * This is the traditional exclusive wait. + * + * (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * + * The waiter is waiting to get the bit, and additionally wants the + * lock to be transferred to it for fair lock behavior. If the lock + * cannot be taken, we stop walking the wait queue without waking + * the waiter. + * + * This is the "fair lock handoff" case, and in addition to setting + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see + * that it now has the lock. + */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { - int ret; + unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); @@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return 0; /* - * If it's an exclusive wait, we get the bit for it, and - * stop walking if we can't. - * - * If it's a non-exclusive wait, then the fact that this - * wake function was called means that the bit already - * was cleared, and we don't care if somebody then - * re-took it. + * If it's a lock handoff wait, we get the bit for it, and + * stop walking (and do not wake it up) if we can't. */ - ret = 0; - if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + flags = wait->flags; + if (flags & WQ_FLAG_EXCLUSIVE) { + if (test_bit(key->bit_nr, &key->page->flags)) return -1; - ret = 1; + if (flags & WQ_FLAG_CUSTOM) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + flags |= WQ_FLAG_DONE; + } } - wait->flags |= WQ_FLAG_WOKEN; + /* + * We are holding the wait-queue lock, but the waiter that + * is waiting for this will be checking the flags without + * any locking. + * + * So update the flags atomically, and wake up the waiter + * afterwards to avoid any races. This store-release pairs + * with the load-acquire in wait_on_page_bit_common(). + */ + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * - * Note that this has to be the absolute last thing we do, - * since after list_del_init(&wait->entry) the wait entry + * Note that this pairs with the "finish_wait()" in the + * waiter, and has to be the absolute last thing we do. + * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); - return ret; + return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1107,8 +1150,8 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark the - * waiter woken if successful. + * Attempt to check (or get) the page bit, and mark us done + * if successful. */ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, struct wait_queue_entry *wait) @@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, } else if (test_bit(bit_nr, &page->flags)) return false; - wait->flags |= WQ_FLAG_WOKEN; + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } +/* How many times do we accept lock stealing from under a waiter? */ +int sysctl_page_lock_unfairness = 5; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { + int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; @@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } init_wait(wait); - wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; +repeat: + wait->flags = 0; + if (behavior == EXCLUSIVE) { + wait->flags = WQ_FLAG_EXCLUSIVE; + if (--unfairness < 0) + wait->flags |= WQ_FLAG_CUSTOM; + } + /* * Do one last check whether we can get the * page bit synchronously. @@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, /* * From now on, all the logic will be based on - * the WQ_FLAG_WOKEN flag, and the and the page - * bit testing (and setting) will be - or has - * already been - done by the wake function. + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to + * see whether the page bit testing has already + * been done by the wake function. * * We can drop our reference to the page. */ if (behavior == DROP) put_page(page); + /* + * Note that until the "finish_wait()", or until + * we see the WQ_FLAG_WOKEN flag, we need to + * be very careful with the 'wait->flags', because + * we may race with a waker that sets them. + */ for (;;) { + unsigned int flags; + set_current_state(state); - if (signal_pending_state(state, current)) + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(state, current)) + break; + + io_schedule(); + continue; + } + + /* If we were non-exclusive, we're done */ + if (behavior != EXCLUSIVE) break; - if (wait->flags & WQ_FLAG_WOKEN) + /* If the waker got the lock for us, we're done */ + if (flags & WQ_FLAG_DONE) break; - io_schedule(); + /* + * Otherwise, if we're getting the lock, we need to + * try to get it ourselves. + * + * And if that fails, we'll have to retry this all. + */ + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + goto repeat; + + wait->flags |= WQ_FLAG_DONE; + break; } + /* + * If a signal happened, this 'finish_wait()' may remove the last + * waiter from the wait-queues, but the PageWaiters bit will remain + * set. That's ok. The next wakeup will take care of it, and trying + * to do it here would be difficult and prone to races. + */ finish_wait(q, wait); if (thrashing) { @@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } /* - * A signal could leave PageWaiters set. Clearing it here if - * !waitqueue_active would be possible (by open-coding finish_wait), - * but still fail to catch it in the case of wait hash collision. We - * already can fail to clear wait hash collision cases, so don't - * bother with signals either. + * NOTE! The wait->flags weren't stable until we've done the + * 'finish_wait()', and we could have exited the loop above due + * to a signal, and had a wakeup event happen after the signal + * test but before the 'finish_wait()'. + * + * So only after the finish_wait() can we reliably determine + * if we got woken up or not, so we can now figure out the final + * return value based on that state without races. + * + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive + * waiter, but an exclusive one requires WQ_FLAG_DONE. */ + if (behavior == EXCLUSIVE) + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } -- cgit v1.2.3 From 54fa9ba564b717fc2cf689427e195c360315999d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 7 Sep 2020 11:32:07 +0200 Subject: ftrace: Let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Cc: Andrew Morton Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ce2c06f72e86..e5c2d5cc6e6a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -85,8 +85,7 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); struct ftrace_ops; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9f893388245..603255f5f085 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7534,8 +7534,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 82d083ab60c3693201c6f5c7a5f23a6ed422098d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:05 +0900 Subject: kprobes: tracing/kprobes: Fix to kill kprobes on initmem after boot Since kprobe_event= cmdline option allows user to put kprobes on the functions in initmem, kprobe has to make such probes gone after boot. Currently the probes on the init functions in modules will be handled by module callback, but the kernel init text isn't handled. Without this, kprobes may access non-exist text area to disable or remove it. Link: https://lkml.kernel.org/r/159972810544.428528.1839307531600646955.stgit@devnote2 Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Cc: Jonathan Corbet Cc: Shuah Khan Cc: Randy Dunlap Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 5 +++++ init/main.c | 2 ++ kernel/kprobes.c | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9be1bff4f586..8aab327b5539 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -373,6 +373,8 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); +void kprobe_free_init_mem(void); + int disable_kprobe(struct kprobe *kp); int enable_kprobe(struct kprobe *kp); @@ -435,6 +437,9 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } +static inline void kprobe_free_init_mem(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -ENOSYS; diff --git a/init/main.c b/init/main.c index ae78fb68d231..038128b2a755 100644 --- a/init/main.c +++ b/init/main.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1402,6 +1403,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + kprobe_free_init_mem(); ftrace_free_init_mem(); free_initmem(); mark_readonly(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d43b48ecdb4f..d750e025d1ff 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2453,6 +2453,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; -- cgit v1.2.3 From 7bb82ac30c3dd4ecf1485685cbe84d2ba10dddf4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:34 -0700 Subject: ftrace: let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ce2c06f72e86..e5c2d5cc6e6a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -85,8 +85,7 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); struct ftrace_ops; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..e9fa580f3083 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7531,8 +7531,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 4773ef33fc6e59bad2e5d19e334de2fa79c27b74 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:37 -0700 Subject: stackleak: let stack_erasing_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of stack_erasing_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/stackleak.c:31:50: warning: incorrect type in argument 3 (different address spaces) kernel/stackleak.c:31:50: expected void * kernel/stackleak.c:31:50: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093253.13656-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- include/linux/stackleak.h | 2 +- kernel/stackleak.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index 3d5c3271a9a8..a59db2f08e76 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -25,7 +25,7 @@ static inline void stackleak_task_init(struct task_struct *t) #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ diff --git a/kernel/stackleak.c b/kernel/stackleak.c index a8fc9ae1d03d..ce161a8e8d97 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -20,7 +20,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); -- cgit v1.2.3 From e2ec5128254518cae320d5dc631b71b94160f663 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Sep 2020 08:54:42 -0700 Subject: dm: Call proper helper to determine dax support DM was calling generic_fsdax_supported() to determine whether a device referenced in the DM table supports DAX. However this is a helper for "leaf" device drivers so that they don't have to duplicate common generic checks. High level code should call dax_supported() helper which that calls into appropriate helper for the particular device. This problem manifested itself as kernel messages: dm-3: error: dax access failed (-95) when lvm2-testsuite run in cases where a DM device was stacked on top of another DM device. Fixes: 7bf7eac8d648 ("dax: Arrange for dax_supported check to span multiple devices") Cc: Tested-by: Adrian Huang Signed-off-by: Jan Kara Acked-by: Mike Snitzer Reported-by: kernel test robot Link: https://lore.kernel.org/r/160061715195.13131.5503173247632041975.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 4 ++++ drivers/md/dm-table.c | 10 +++++++--- include/linux/dax.h | 22 ++++++++++++++++++++-- 3 files changed, 31 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index e5767c83ea23..b6284c5cae0a 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -325,11 +325,15 @@ EXPORT_SYMBOL_GPL(dax_direct_access); bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t len) { + if (!dax_dev) + return false; + if (!dax_alive(dax_dev)) return false; return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len); } +EXPORT_SYMBOL_GPL(dax_supported); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5edc3079e7c1..229f461e7def 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -860,10 +860,14 @@ EXPORT_SYMBOL_GPL(dm_table_set_type); int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - int blocksize = *(int *) data; + int blocksize = *(int *) data, id; + bool rc; - return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize, - start, len); + id = dax_read_lock(); + rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); + dax_read_unlock(id); + + return rc; } /* Check devices support synchronous DAX */ diff --git a/include/linux/dax.h b/include/linux/dax.h index 6904d4e0b2e0..497031392e0a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -130,6 +130,8 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, sectors); } +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -157,6 +159,13 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return false; } +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} + static inline void fs_put_dax(struct dax_device *dax_dev) { } @@ -189,14 +198,23 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) } #endif +#if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); +#else +static inline int dax_read_lock(void) +{ + return 0; +} + +static inline void dax_read_unlock(int id) +{ +} +#endif /* CONFIG_DAX */ bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, -- cgit v1.2.3 From 88b67edd7247466bc47f01e1dc539b0d0d4b931e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Sep 2020 11:33:23 +0200 Subject: dax: Fix compilation for CONFIG_DAX && !CONFIG_FS_DAX dax_supported() is defined whenever CONFIG_DAX is enabled. So dummy implementation should be defined only in !CONFIG_DAX case, not in !CONFIG_FS_DAX case. Fixes: e2ec51282545 ("dm: Call proper helper to determine dax support") Cc: Reported-by: Geert Uytterhoeven Reported-by: Naresh Kamboju Reported-by: kernel test robot Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/dax.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 497031392e0a..43b39ab9de1a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -58,6 +58,8 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev) { __set_dax_synchronous(dax_dev); } +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); /* * Check if given mapping is supported by the file / underlying device. */ @@ -104,6 +106,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { @@ -130,8 +138,6 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, sectors); } -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -159,13 +165,6 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return false; } -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} - static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3