diff options
author | Linus Torvalds | 2014-12-11 18:57:19 -0800 |
---|---|---|
committer | Linus Torvalds | 2014-12-11 18:57:19 -0800 |
commit | 2756d373a3f45a3a9ebf4ac389f9e0e02bd35a93 (patch) | |
tree | e248c5adccb3045f96b3cfe0a1ffeb37bb81e4cb /kernel | |
parent | 4e8790f77f051d4cc745a57b48a73052521e8dfc (diff) | |
parent | eeecbd1971517103e06f11750dd1a9a1dc37e4e6 (diff) |
Merge branch 'for-3.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup update from Tejun Heo:
"cpuset got simplified a bit. cgroup core got a fix on unified
hierarchy and grew some effective css related interfaces which will be
used for blkio support for writeback IO traffic which is currently
being worked on"
* 'for-3.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: implement cgroup_get_e_css()
cgroup: add cgroup_subsys->css_e_css_changed()
cgroup: add cgroup_subsys->css_released()
cgroup: fix the async css offline wait logic in cgroup_subtree_control_write()
cgroup: restructure child_subsys_mask handling in cgroup_subtree_control_write()
cgroup: separate out cgroup_calc_child_subsys_mask() from cgroup_refresh_child_subsys_mask()
cpuset: lock vs unlock typo
cpuset: simplify cpuset_node_allowed API
cpuset: convert callback_mutex to a spinlock
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 175 | ||||
-rw-r--r-- | kernel/cpuset.c | 162 |
2 files changed, 178 insertions, 159 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 136eceadeed1..bb263d0caab3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Use ->child_subsys_mask. + */ while (cgroup_parent(cgrp) && !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); @@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). + */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp) } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup + * @subtree_control: the new subtree_control mask to consider * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * - * This function determines which subsystems need to be enabled given the - * current @cgrp->subtree_control and records it in - * @cgrp->child_subsys_mask. The resulting mask is always a superset of - * @cgrp->subtree_control and follows the usual hierarchy rules. + * This function calculates which subsystems need to be enabled if + * @subtree_control is to be applied to @cgrp. The returned mask is always + * a superset of @subtree_control and follows the usual hierarchy rules. */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned int subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned int cur_ss_mask = cgrp->subtree_control; + unsigned int cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) { - cgrp->child_subsys_mask = cur_ss_mask; - return; - } + if (!cgroup_on_dfl(cgrp)) + return cur_ss_mask; while (true) { unsigned int new_ss_mask = cur_ss_mask; @@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) cur_ss_mask = new_ss_mask; } - cgrp->child_subsys_mask = cur_ss_mask; + return cur_ss_mask; +} + +/** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * Update @cgrp->child_subsys_mask according to the current + * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + cgrp->child_subsys_mask = + cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); } /** @@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - unsigned int css_enable, css_disable, old_ctrl, new_ctrl; + unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = -ENOENT; goto out_unlock; } - - /* - * @ss is already enabled through dependency and - * we'll just make it visible. Skip draining. - */ - if (cgrp->child_subsys_mask & (1 << ssid)) - continue; - - /* - * Because css offlining is asynchronous, userland - * might try to re-enable the same controller while - * the previous instance is still around. In such - * cases, wait till it's gone using offline_waitq. - */ - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); @@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * subsystems than specified may need to be enabled or disabled * depending on subsystem dependencies. */ - cgrp->subtree_control |= enable; - cgrp->subtree_control &= ~disable; + old_sc = cgrp->subtree_control; + old_ss = cgrp->child_subsys_mask; + new_sc = (old_sc | enable) & ~disable; + new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - old_ctrl = cgrp->child_subsys_mask; - cgroup_refresh_child_subsys_mask(cgrp); - new_ctrl = cgrp->child_subsys_mask; - - css_enable = ~old_ctrl & new_ctrl; - css_disable = old_ctrl & ~new_ctrl; + css_enable = ~old_ss & new_ss; + css_disable = old_ss & ~new_ss; enable |= css_enable; disable |= css_disable; /* + * Because css offlining is asynchronous, userland might try to + * re-enable the same controller while the previous instance is + * still around. In such cases, wait till it's gone using + * offline_waitq. + */ + for_each_subsys(ss, ssid) { + if (!(css_enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); + + if (!cgroup_css(child, ss)) + continue; + + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); + + return restart_syscall(); + } + } + + cgrp->subtree_control = new_sc; + cgrp->child_subsys_mask = new_ss; + + /* * Create new csses or make the existing ones visible. A css is * created invisible if it's being implicitly enabled through * dependency. An invisible css is made visible when the userland @@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } } + /* + * The effective csses of all the descendants (excluding @cgrp) may + * have changed. Subsystems can optionally subscribe to this event + * by implementing ->css_e_css_changed() which is invoked if any of + * the effective csses seen from the css's cgroup may have changed. + */ + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); + struct cgroup_subsys_state *css; + + if (!ss->css_e_css_changed || !this_css) + continue; + + css_for_each_descendant_pre(css, this_css) + if (css != this_css) + ss->css_e_css_changed(css); + } + kernfs_activate(cgrp->kn); ret = 0; out_unlock: @@ -2832,9 +2898,8 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->subtree_control &= ~enable; - cgrp->subtree_control |= disable; - cgroup_refresh_child_subsys_mask(cgrp); + cgrp->subtree_control = old_sc; + cgrp->child_subsys_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); + if (ss->css_released) + ss->css_released(css); } else { /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 723cfc9d0ad7..64b257f6bca2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -248,34 +248,34 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global mutexes guarding cpuset structures - cpuset_mutex - * and callback_mutex. The latter may nest inside the former. We also - * require taking task_lock() when dereferencing a task's cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this + * comment. * - * A task must hold both mutexes to modify cpusets. If a task holds + * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_mutex and be able to + * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_mutex to query cpusets. - * Once it is ready to make the changes, it takes callback_mutex, blocking + * callback routines can briefly acquire callback_lock to query cpusets. + * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. * * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex + * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_mutex, then it has read-only + * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_mutex across + * The cpuset_common_file_read() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. @@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = { * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { @@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cpuset_mutex held + * Call with callback_lock or cpuset_mutex held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ update_cpumasks_hier(cs, trialcs->cpus_allowed); @@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); @@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &cs->mems_allowed); @@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); @@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) count = seq_get_buf(sf, &buf); s = buf; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: @@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) seq_commit(sf, -1); } out_unlock: - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); return ret; } @@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(cs->css.cgroup)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); return 0; @@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(root_css->cgroup)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); @@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } @@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, { bool is_empty; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, @@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs, if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (cpus_updated) update_tasks_cpumask(cs); @@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); cpumask_copy(top_cpuset.effective_cpus, &new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } @@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - mutex_lock(&callback_mutex); + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) @@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; + unsigned long flags; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return mask; } @@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) @@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall - * cpusets, and never sleeps. - * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by * (in get_page_from_freelist()) refusing to consider the zones for @@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The + * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. + * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, @@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. - * - * Rule: - * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +int __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ + unsigned long flags; if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; /* @@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return allowed; } -/* - * cpuset_node_allowed_hardwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If the task has been OOM killed and has access to memory reserves as - * specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_node_allowed_softwall() variant, above, - * this variant requires that the node be in the current task's - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. - */ -int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page |