cgroup: cgroup v2 freezer

Cgroup v1 implements the freezer controller, which provides an ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for some other tasks. Cgroup v2 lacks this functionality. This patch implements freezer for cgroup v2. Cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It's also possible to migrate a frozen tasks to another cgroup. This differs cgroup v2 freezer from cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. However uninterruptible sleep is fine when all tasks are going to be frozen (hibernation case), it's not the acceptable state for some subset of the system. Cgroup v2 freezer is not supporting freezing kthreads. If a non-root cgroup contains kthread, the cgroup still can be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH is not working because non-fatal signal delivery is blocked in frozen state. There are some interface differences between cgroup v1 and cgroup v2 freezer too, which are required to conform the cgroup v2 interface design principles: 1) There is no separate controller, which has to be turned on: the functionality is always available and is represented by cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available using cgroup.events control file ("frozen" field). There are no dedicated transitional states. 4) It's allowed to make any changes with the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) no matter if some cgroups are frozen. Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Tejun Heo <tj@kernel.org> No-objection-from-me-by: Oleg Nesterov <oleg@redhat.com> Cc: kernel-team@fb.com
author: Roman Gushchin 2019-04-19 10:03:04 -0700
committer: Tejun Heo 2019-04-19 11:26:48 -0700
commit: 76f969e8948d82e78e1bc4beb6b9465908e74873 (patch)
tree: 1f5459d94820c5e5ea7293b103e8531d389c15c1 /kernel/signal.c
parent: 4dcabece4c3a9f9522127be12cc12cc120399b2f (diff)
1 files changed, 65 insertions, 5 deletions
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..095e0fc57b25 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -43,6 +43,7 @@
 #include <linux/compiler.h>
 #include <linux/posix-timers.h>
 #include <linux/livepatch.h>
+#include <linux/cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static bool recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->jobctl & JOBCTL_PENDING_MASK) ||
+	if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
 	    PENDING(&t->pending, &t->blocked) ||
-	    PENDING(&t->signal->shared_pending, &t->blocked)) {
+	    PENDING(&t->signal->shared_pending, &t->blocked) ||
+	    cgroup_task_frozen(t)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
 		return true;
 	}
@@ -2108,6 +2110,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 		preempt_disable();
 		read_unlock(&tasklist_lock);
 		preempt_enable_no_resched();
+		cgroup_enter_frozen();
 		freezable_schedule();
 	} else {
 		/*
@@ -2286,6 +2289,7 @@ static bool do_signal_stop(int signr)
 		}
 
 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
+		cgroup_enter_frozen();
 		freezable_schedule();
 		return true;
 	} else {
@@ -2332,6 +2336,43 @@ static void do_jobctl_trap(void)
 	}
 }
 
+/**
+ * do_freezer_trap - handle the freezer jobctl trap
+ *
+ * Puts the task into frozen state, if only the task is not about to quit.
+ * In this case it drops JOBCTL_TRAP_FREEZE.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held,
+ * which is always released before returning.
+ */
+static void do_freezer_trap(void)
+	__releases(&current->sighand->siglock)
+{
+	/*
+	 * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
+	 * let's make another loop to give it a chance to be handled.
+	 * In any case, we'll return back.
+	 */
+	if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
+	     JOBCTL_TRAP_FREEZE) {
+		spin_unlock_irq(&current->sighand->siglock);
+		return;
+	}
+
+	/*
+	 * Now we're sure that there is no pending fatal signal and no
+	 * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
+	 * immediately (if there is a non-fatal signal pending), and
+	 * put the task into sleep.
+	 */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	clear_thread_flag(TIF_SIGPENDING);
+	spin_unlock_irq(&current->sighand->siglock);
+	cgroup_enter_frozen();
+	freezable_schedule();
+}
+
 static int ptrace_signal(int signr, kernel_siginfo_t *info)
 {
 	/*
@@ -2442,6 +2483,10 @@ relock:
 		ksig->info.si_signo = signr = SIGKILL;
 		sigdelset(&current->pending.signal, SIGKILL);
 		recalc_sigpending();
+		current->jobctl &= ~JOBCTL_TRAP_FREEZE;
+		spin_unlock_irq(&sighand->siglock);
+		if (unlikely(cgroup_task_frozen(current)))
+			cgroup_leave_frozen(true);
 		goto fatal;
 	}
 
@@ -2452,9 +2497,24 @@ relock:
 		    do_signal_stop(0))
 			goto relock;
 
-		if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
-			do_jobctl_trap();
+		if (unlikely(current->jobctl &
+			     (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
+			if (current->jobctl & JOBCTL_TRAP_MASK) {
+				do_jobctl_trap();
+				spin_unlock_irq(&sighand->siglock);
+			} else if (current->jobctl & JOBCTL_TRAP_FREEZE)
+				do_freezer_trap();
+
+			goto relock;
+		}
+
+		/*
+		 * If the task is leaving the frozen state, let's update
+		 * cgroup counters and reset the frozen bit.
+		 */
+		if (unlikely(cgroup_task_frozen(current))) {
 			spin_unlock_irq(&sighand->siglock);
+			cgroup_leave_frozen(true);
 			goto relock;
 		}
 
@@ -2548,8 +2608,8 @@ relock:
 			continue;
 		}
 
-	fatal:
 		spin_unlock_irq(&sighand->siglock);
+	fatal:
 
 		/*
 		 * Anything else is fatal, maybe with a core dump.
author	Roman Gushchin	2019-04-19 10:03:04 -0700
committer	Tejun Heo	2019-04-19 11:26:48 -0700
commit	76f969e8948d82e78e1bc4beb6b9465908e74873 (patch)
tree	1f5459d94820c5e5ea7293b103e8531d389c15c1 /kernel/signal.c
parent	4dcabece4c3a9f9522127be12cc12cc120399b2f (diff)