aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c129
-rw-r--r--kernel/cgroup_freezer.c72
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/kprobes.c7
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/ns_cgroup.c8
-rw-r--r--kernel/perf_event.c94
-rw-r--r--kernel/ptrace.c36
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/softirq.c16
-rw-r--r--kernel/taskstats.c172
-rw-r--r--kernel/trace/ring_buffer.c335
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace_kprobe.c1
-rw-r--r--kernel/tsacct.c10
19 files changed, 514 insertions, 406 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9270d532ec3c..5cf366965d0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
+static int clone_children(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (clone_children(&root->top_cgroup))
+ seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ bool clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
*/
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
int i;
bool module_pin_failed = false;
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
- if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "none")) {
+ if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
- } else if (!strcmp(token, "noprefix")) {
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->clone_children = true;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- } else if (!strncmp(token, "name=", 5)) {
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
- } else {
- struct cgroup_subsys *ss;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+
+ continue;
+ }
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ set_bit(i, &opts->subsys_bits);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise 'all, 'none' and a subsystem name options were not
+ * specified, let's default to 'all'
+ */
+ if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (ss->disabled)
+ continue;
+ set_bit(i, &opts->subsys_bits);
}
}
@@ -1355,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
+ if (opts->clone_children)
+ set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
@@ -1880,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (strlen(buffer) >= PATH_MAX)
+ return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
strcpy(cgrp->root->release_agent_path, buffer);
@@ -3173,6 +3217,23 @@ fail:
return ret;
}
+static u64 cgroup_clone_children_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return clone_children(cgrp);
+}
+
+static int cgroup_clone_children_write(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ if (val)
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ else
+ clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ return 0;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
@@ -3203,6 +3264,11 @@ static struct cftype files[] = {
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
};
static struct cftype cft_release_agent = {
@@ -3332,6 +3398,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ if (clone_children(parent))
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
@@ -3346,6 +3415,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
+ if (clone_children(parent) && ss->post_clone)
+ ss->post_clone(ss, cgrp);
}
cgroup_lock_hierarchy(root);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
struct freezer, css);
}
-int cgroup_freezing_or_frozen(struct task_struct *task)
+static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
{
- struct freezer *freezer;
- enum freezer_state state;
+ enum freezer_state state = task_freezer(task)->state;
+ return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
+}
+int cgroup_freezing_or_frozen(struct task_struct *task)
+{
+ int result;
task_lock(task);
- freezer = task_freezer(task);
- if (!freezer->css.cgroup->parent)
- state = CGROUP_THAWED; /* root cgroup can't be frozen */
- else
- state = freezer->state;
+ result = __cgroup_freezing_or_frozen(task);
task_unlock(task);
-
- return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
+ return result;
}
/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
kfree(cgroup_freezer(cgroup));
}
-/* Task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
-{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
-}
-
/*
* The call to cgroup_lock() in the freezer.state write method prevents
* a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
/*
* Anything frozen can't move or be moved to/from.
- *
- * Since orig_freezer->state == FROZEN means that @task has been
- * frozen, so it's sufficient to check the latter condition.
*/
- if (is_task_frozen_enough(task))
+ freezer = cgroup_freezer(new_cgroup);
+ if (freezer->state != CGROUP_THAWED)
return -EBUSY;
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state == CGROUP_FROZEN)
+ rcu_read_lock();
+ if (__cgroup_freezing_or_frozen(task)) {
+ rcu_read_unlock();
return -EBUSY;
+ }
+ rcu_read_unlock();
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (is_task_frozen_enough(c)) {
+ if (__cgroup_freezing_or_frozen(c)) {
rcu_read_unlock();
return -EBUSY;
}
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
/*
* caller must hold freezer->lock
*/
-static void update_freezer_state(struct cgroup *cgroup,
+static void update_if_frozen(struct cgroup *cgroup,
struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
unsigned int nfrozen = 0, ntotal = 0;
+ enum freezer_state old_state = freezer->state;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
ntotal++;
- if (is_task_frozen_enough(task))
+ if (frozen(task))
nfrozen++;
}
- /*
- * Transition to FROZEN when no new tasks can be added ensures
- * that we never exist in the FROZEN state while there are unfrozen
- * tasks.
- */
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- else if (nfrozen > 0)
- freezer->state = CGROUP_FREEZING;
- else
- freezer->state = CGROUP_THAWED;
+ if (old_state == CGROUP_THAWED) {
+ BUG_ON(nfrozen > 0);
+ } else if (old_state == CGROUP_FREEZING) {
+ if (nfrozen == ntotal)
+ freezer->state = CGROUP_FROZEN;
+ } else { /* old_state == CGROUP_FROZEN */
+ BUG_ON(nfrozen != ntotal);
+ }
+
cgroup_iter_end(cgroup, &it);
}
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
if (state == CGROUP_FREEZING) {
/* We change from FREEZING to FROZEN lazily if the cgroup was
* only partially frozen when we exitted write. */
- update_freezer_state(cgroup, freezer);
+ update_if_frozen(cgroup, freezer);
state = freezer->state;
}
spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
while ((task = cgroup_iter_next(cgroup, &it))) {
if (!freeze_task(task, true))
continue;
- if (is_task_frozen_enough(task))
+ if (frozen(task))
continue;
if (!freezing(task) && !freezer_should_skip(task))
num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
spin_lock_irq(&freezer->lock);
- update_freezer_state(cgroup, freezer);
+ update_if_frozen(cgroup, freezer);
if (goal_state == freezer->state)
goto out;
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_guard_mutex
+ * - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
- mutex_init(&p->cred_guard_mutex);
-
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
diff --git a/kernel/exit.c b/kernel/exit.c
index 894179a32ec1..b194febf5799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -703,6 +703,8 @@ static void exit_mm(struct task_struct * tsk)
* space.
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *thread;
diff --git a/kernel/fork.c b/kernel/fork.c
index e87aaaaf5131..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
+ mutex_init(&sig->cred_guard_mutex);
+
return 0;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
struct irq_desc *desc = irq_to_desc(irq);
return desc ? desc->kstat_irqs[cpu] : 0;
}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int cpu;
+ int sum = 0;
+
+ if (!desc)
+ return 0;
+ for_each_possible_cpu(cpu)
+ sum += desc->kstat_irqs[cpu];
+ return sum;
+}
+#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 56a891914273..99865c33a60d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;
-static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
+/* This protects kprobe_table and optimizing_list */
+static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
}
#ifdef CONFIG_SYSCTL
+/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
return;
kprobes_allow_optimization = true;
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
- mutex_unlock(&text_mutex);
printk(KERN_INFO "Kprobes globally optimized\n");
}
+/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
{
}
-static void add_kallsyms(struct module *mod, struct load_info *info)
+static void add_kallsyms(struct module *mod, const struct load_info *info)
{
}
#endif /* CONFIG_KALLSYMS */
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
return ERR_PTR(-EPERM);
if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
+ if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
+ printk("ns_cgroup can't be created with parent "
+ "'clone_children' set.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ printk_once("ns_cgroup deprecated: consider using the "
+ "'clone_children' flag without the ns_cgroup.\n");
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
return event->cpu == -1 || event->cpu == smp_processor_id();
}
-static int
-__event_sched_out(struct perf_event *event,
+static void
+event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
- return 0;
+ return;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
+ event->tstamp_stopped = ctx->time;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
ctx->nr_active--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- return 1;
-}
-
-static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret;
-
- ret = __event_sched_out(event, cpuctx, ctx);
- if (ret)
- event->tstamp_stopped = ctx->time;
}
static void
@@ -664,7 +652,7 @@ retry:
}
static int
-__event_sched_in(struct perf_event *event,
+event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event,
return -EAGAIN;
}
+ event->tstamp_running += ctx->time - event->tstamp_stopped;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
ctx->nr_active++;
@@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event,
return 0;
}
-static inline int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret = __event_sched_in(event, cpuctx, ctx);
- if (ret)
- return ret;
- event->tstamp_running += ctx->time - event->tstamp_stopped;
- return 0;
-}
-
-static void
-group_commit_event_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- struct perf_event *event;
- u64 now = ctx->time;
-
- group_event->tstamp_running += now - group_event->tstamp_stopped;
- /*
- * Schedule in siblings as one group (if any):
- */
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- event->tstamp_running += now - event->tstamp_stopped;
- }
-}
-
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
@@ -730,19 +691,15 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = group_event->pmu;
+ u64 now = ctx->time;
+ bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu);
- /*
- * use __event_sched_in() to delay updating tstamp_running
- * until the transaction is committed. In case of failure
- * we will keep an unmodified tstamp_running which is a
- * requirement to get correct timing information
- */
- if (__event_sched_in(group_event, cpuctx, ctx)) {
+ if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
return -EAGAIN;
}
@@ -751,31 +708,42 @@ group_sched_in(struct perf_event *group_event,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (__event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
}
- if (!pmu->commit_txn(pmu)) {
- /* commit tstamp_running */
- group_commit_event_sched_in(group_event, cpuctx, ctx);
+ if (!pmu->commit_txn(pmu))
return 0;
- }
+
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
+ * The events up to the failed event are scheduled out normally,
+ * tstamp_stopped will be updated.
*
- * use __event_sched_out() to avoid updating tstamp_stopped
- * because the event never actually ran
+ * The failed events and the remaining siblings need to have
+ * their timings updated as if they had gone thru event_sched_in()
+ * and event_sched_out(). This is required to get consistent timings
+ * across the group. This also takes care of the case where the group
+ * could never be scheduled by ensuring tstamp_stopped is set to mark
+ * the time the event was actually stopped, such that time delta
+ * calculation in update_event_times() is correct.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- break;
- __event_sched_out(event, cpuctx, ctx);
+ simulate = true;
+
+ if (simulate) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ event->tstamp_stopped = now;
+ } else {
+ event_sched_out(event, cpuctx, ctx);
+ }
}
- __event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, cpuctx, ctx);
pmu->cancel_txn(pmu);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
* under ptrace.
*/
retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->cred_guard_mutex))
+ if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
goto out;
task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
unlock_tasklist:
write_unlock_irq(&tasklist_lock);
unlock_creds:
- mutex_unlock(&task->cred_guard_mutex);
+ mutex_unlock(&task->signal->cred_guard_mutex);
out:
return retval;
}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
* and reacquire the lock.
*/
void exit_ptrace(struct task_struct *tracer)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
{
struct task_struct *p, *n;
LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
return copied;
}
-static int ptrace_setoptions(struct task_struct *child, long data)
+static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
child->ptrace &= ~PT_TRACE_MASK;
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
#define is_sysemu_singlestep(request) 0
#endif
-static int ptrace_resume(struct task_struct *child, long request, long data)
+static int ptrace_resume(struct task_struct *child, long request,
+ unsigned long data)
{
if (!valid_signal(data))
return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
#endif
int ptrace_request(struct task_struct *child, long request,
- long addr, long data)
+ unsigned long addr, unsigned long data)
{
int ret = -EIO;
siginfo_t siginfo;
+ void __user *datavp = (void __user *) data;
+ unsigned long __user *datalp = datavp;
switch (request) {
case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setoptions(child, data);
break;
case PTRACE_GETEVENTMSG:
- ret = put_user(child->ptrace_message, (unsigned long __user *) data);
+ ret = put_user(child->ptrace_message, datalp);
break;
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, &siginfo);
if (!ret)
- ret = copy_siginfo_to_user((siginfo_t __user *) data,
- &siginfo);
+ ret = copy_siginfo_to_user(datavp, &siginfo);
break;
case PTRACE_SETSIGINFO:
- if (copy_from_user(&siginfo, (siginfo_t __user *) data,
- sizeof siginfo))
+ if (copy_from_user(&siginfo, datavp, sizeof siginfo))
ret = -EFAULT;
else
ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
}
mmput(mm);
- ret = put_user(tmp, (unsigned long __user *) data);
+ ret = put_user(tmp, datalp);
break;
}
#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_SETREGSET:
{
struct iovec kiov;
- struct iovec __user *uiov = (struct iovec __user *) data;
+ struct iovec __user *uiov = datavp;
if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
#define arch_ptrace_attach(child) do { } while (0)
#endif
-SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
+SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
+ unsigned long, data)
{
struct task_struct *child;
long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
return ret;
}
-int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
+int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
{
unsigned long tmp;
int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
return put_user(tmp, (unsigned long __user *)data);
}
-int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
+int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
{
int copied;
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9c9841cb6902 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -453,6 +453,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
if (first == parent)
return first;
+ if (WARN_ON(first == new)) /* duplicated insertion */
+ return first;
if ((first->start > new->start) || (first->end < new->end))
break;
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
return count;
}
-struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+ unsigned long *flags)
{
struct sighand_struct *sighand;
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
* is gone, we keep current->exit_code unless clear_code.
*/
static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+ __releases(&current->sighand->siglock)
+ __acquires(&current->sighand->siglock)
{
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f02a9dfa19bc..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -229,18 +229,20 @@ restart:
do {
if (pending & 1) {
+ unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
- kstat_incr_softirqs_this_cpu(h - softirq_vec);
- trace_softirq_entry(h, softirq_vec);
+ kstat_incr_softirqs_this_cpu(vec_nr);
+
+ trace_softirq_entry(vec_nr);
h->action(h);
- trace_softirq_exit(h, softirq_vec);
+ trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %td %s %p"
+ printk(KERN_ERR "huh, entered softirq %u %s %p"
"with preempt_count %08x,"
- " exited with %08x?\n", h - softirq_vec,
- softirq_to_name[h - softirq_vec],
- h->action, prev_count, preempt_count());
+ " exited with %08x?\n", vec_nr,
+ softirq_to_name[vec_nr], h->action,
+ prev_count, preempt_count());
preempt_count() = prev_count;
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
-static int fill_pid(pid_t pid, struct task_struct *tsk,
- struct taskstats *stats)
+static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
{
- int rc = 0;
-
- if (!tsk) {
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
- if (!tsk)
- return -ESRCH;
- } else
- get_task_struct(tsk);
-
memset(stats, 0, sizeof(*stats));
/*
* Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
+}
- /* Define err: label here if needed */
- put_task_struct(tsk);
- return rc;
+static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
+{
+ struct task_struct *tsk;
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return -ESRCH;
+ fill_stats(tsk, stats);
+ put_task_struct(tsk);
+ return 0;
}
-static int fill_tgid(pid_t tgid, struct task_struct *first,
- struct taskstats *stats)
+static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
{
- struct task_struct *tsk;
+ struct task_struct *tsk, *first;
unsigned long flags;
int rc = -ESRCH;
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
* leaders who are already counted with the dead tasks
*/
rcu_read_lock();
- if (!first)
- first = find_task_by_vpid(tgid);
+ first = find_task_by_vpid(tgid);
if (!first || !lock_task_sighand(first, &flags))
goto out;
@@ -268,7 +263,6 @@ out:
return rc;
}
-
static void fill_tgid_exit(struct task_struct *tsk)
{
unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
struct nlattr *na, *ret;
int aggr;
+ /* If we don't pad, we end up with alignment on a 4 byte boundary.
+ * This causes lots of runtime warnings on systems requiring 8 byte
+ * alignment */
+ u32 pids[2] = { pid, 0 };
+ int pid_size = ALIGN(sizeof(pid), sizeof(long));
+
aggr = (type == TASKSTATS_TYPE_PID)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
na = nla_nest_start(skb, aggr);
if (!na)
goto err;
- if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ if (nla_put(skb, type, pid_size, pids) < 0)
goto err;
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
if (!ret)
@@ -424,39 +424,46 @@ err:
return rc;
}
-static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+static int cmd_attr_register_cpumask(struct genl_info *info)
{
- int rc;
- struct sk_buff *rep_skb;
- struct taskstats *stats;
- size_t size;
cpumask_var_t mask;
+ int rc;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
-
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
if (rc < 0)
- goto free_return_rc;
- if (rc == 0) {
- rc = add_del_listener(info->snd_pid, mask, REGISTER);
- goto free_return_rc;
- }
+ goto out;
+ rc = add_del_listener(info->snd_pid, mask, REGISTER);
+out:
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static int cmd_attr_deregister_cpumask(struct genl_info *info)
+{
+ cpumask_var_t mask;
+ int rc;
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
if (rc < 0)
- goto free_return_rc;
- if (rc == 0) {
- rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
-free_return_rc:
- free_cpumask_var(mask);
- return rc;
- }
+ goto out;
+ rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+out:
free_cpumask_var(mask);
+ return rc;
+}
+
+static int cmd_attr_pid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 pid;
+ int rc;
- /*
- * Size includes space for nested attributes
- */
size = nla_total_size(sizeof(u32)) +
nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
@@ -465,33 +472,64 @@ free_return_rc:
return rc;
rc = -EINVAL;
- if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
- u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
- if (!stats)
- goto err;
-
- rc = fill_pid(pid, NULL, stats);
- if (rc < 0)
- goto err;
- } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
- u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
- if (!stats)
- goto err;
-
- rc = fill_tgid(tgid, NULL, stats);
- if (rc < 0)
- goto err;
- } else
+ pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+ if (!stats)
+ goto err;
+
+ rc = fill_stats_for_pid(pid, stats);
+ if (rc < 0)
+ goto err;
+ return send_reply(rep_skb, info);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+static int cmd_attr_tgid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 tgid;
+ int rc;
+
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return rc;
+
+ rc = -EINVAL;
+ tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ if (!stats)
goto err;
+ rc = fill_stats_for_tgid(tgid, stats);
+ if (rc < 0)
+ goto err;
return send_reply(rep_skb, info);
err:
nlmsg_free(rep_skb);
return rc;
}
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
+ return cmd_attr_register_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
+ return cmd_attr_deregister_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
+ return cmd_attr_pid(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
+ return cmd_attr_tgid(info);
+ else
+ return -EINVAL;
+}
+
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (!stats)
goto err;
- rc = fill_pid(-1, tsk, stats);
- if (rc < 0)
- goto err;
+ fill_stats(tsk, stats);
/*
* Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c3dab054d18e..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
RB_LEN_TIME_STAMP = 16,
};
+#define skip_time_extend(event) \
+ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
return length + RB_EVNT_HDR_SIZE;
}
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
return 0;
}
+/*
+ * Return total length of time extend and data,
+ * or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+ unsigned len = 0;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ /* time extends include the data event after it */
+ len = RB_LEN_TIME_EXTEND;
+ event = skip_time_extend(event);
+ }
+ return len + rb_event_length(event);
+}
+
/**
* ring_buffer_event_length - return the length of the event
* @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
- unsigned length = rb_event_length(event);
+ unsigned length;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
+ length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void *
rb_event_data(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
-
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+
+ return skip_time_extend(event);
+}
+
/**
* ring_buffer_update_event - update event type and data
* @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* data field.
*/
static void
-rb_update_event(struct ring_buffer_event *event,
- unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, unsigned length,
+ int add_timestamp, u64 delta)
{
- event->type_len = type;
-
- switch (type) {
-
- case RINGBUF_TYPE_PADDING:
- case RINGBUF_TYPE_TIME_EXTEND:
- case RINGBUF_TYPE_TIME_STAMP:
- break;
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
- case 0:
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
- event->array[0] = length;
- else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
- break;
- default:
- BUG();
+ /*
+ * If we need to add a timestamp, then we
+ * add it to the start of the resevered space.
+ */
+ if (unlikely(add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
+ delta = 0;
}
+
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
}
/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
-static struct ring_buffer_event *
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length, unsigned long tail,
- struct buffer_page *tail_page, u64 *ts)
+ struct buffer_page *tail_page, u64 ts)
{
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Nested commits always have zero deltas, so
* just reread the time stamp
*/
- *ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = *ts;
+ ts = rb_time_stamp(buffer);
+ next_page->page->time_stamp = ts;
}
out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length, u64 *ts)
+ unsigned long length, u64 ts,
+ u64 delta, int add_timestamp)
{
struct buffer_page *tail_page;
struct ring_buffer_event *event;
unsigned long tail, write;
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(add_timestamp))
+ length += RB_LEN_TIME_EXTEND;
+
tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write);
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - length;
/* See if we shot pass the end of this buffer page */
- if (write > BUF_PAGE_SIZE)
+ if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
tail_page, ts);
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(event, type, length);
+ rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
- /* The passed in type is zero for DATA */
- if (likely(!type))
- local_inc(&tail_page->entries);
+ local_inc(&tail_page->entries);
/*
* If this is the first commit on the page, then update
* its timestamp.
*/
if (!tail)
- tail_page->page->time_stamp = *ts;
+ tail_page->page->time_stamp = ts;
return event;
}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long addr;
new_index = rb_event_index(event);
- old_index = new_index + rb_event_length(event);
+ old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
addr &= PAGE_MASK;
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- u64 *ts, u64 *delta)
-{
- struct ring_buffer_event *event;
- int ret;
-
- WARN_ONCE(*delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
- (unsigned long long)*delta,
- (unsigned long long)*ts,
- (unsigned long long)cpu_buffer->write_stamp);
-
- /*
- * The delta is too big, we to add a
- * new timestamp.
- */
- event = __rb_reserve_next(cpu_buffer,
- RINGBUF_TYPE_TIME_EXTEND,
- RB_LEN_TIME_EXTEND,
- ts);
- if (!event)
- return -EBUSY;
-
- if (PTR_ERR(event) == -EAGAIN)
- return -EAGAIN;
-
- /* Only a commited time event can update the write stamp */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * If this is the first on the page, then it was
- * updated with the page itself. Try to discard it
- * and if we can't just make it zero.
- */
- if (rb_event_index(event)) {
- event->time_delta = *delta & TS_MASK;
- event->array[0] = *delta >> TS_SHIFT;
- } else {
- /* try to discard, since we do not need this */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- }
- cpu_buffer->write_stamp = *ts;
- /* let the caller know this was the commit */
- ret = 1;
- } else {
- /* Try to discard the event */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* Darn, this is just wasted space */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- ret = 0;
- }
-
- *delta = 0;
-
- return ret;
-}
-
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits);
}
-static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
unsigned long length)
{
struct ring_buffer_event *event;
- u64 ts, delta = 0;
- int commit = 0;
+ u64 ts, delta;
int nr_loops = 0;
+ int add_timestamp;
+ u64 diff;
rb_start_commit(cpu_buffer);
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
length = rb_calculate_event_length(length);
again:
+ add_timestamp = 0;
+ delta = 0;
+
/*
* We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
goto out_fail;
ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = ts - cpu_buffer->write_stamp;
- /*
- * Only the first commit can update the timestamp.
- * Yes there is a race here. If an interrupt comes in
- * just after the conditional and it traces too, then it
- * will also check the deltas. More than one timestamp may
- * also be made. But only the entry that did the actual
- * commit will be something other than zero.
- */
- if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
- rb_page_write(cpu_buffer->tail_page) ==
- rb_commit_index(cpu_buffer))) {
- u64 diff;
-
- diff = ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- /* Did the write stamp get updated already? */
- if (unlikely(ts < cpu_buffer->write_stamp))
- goto get_event;
+ /* make sure this diff is calculated here */
+ barrier();
+ /* Did the write stamp get updated already? */
+ if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
-
- commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
- if (commit == -EBUSY)
- goto out_fail;
-
- if (commit == -EAGAIN)
- goto again;
-
- RB_WARN_ON(cpu_buffer, commit < 0);
+ WARN_ONCE(delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ (unsigned long long)delta,
+ (unsigned long long)ts,
+ (unsigned long long)cpu_buffer->write_stamp);
+ add_timestamp = 1;
}
}
- get_event:
- event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+ event = __rb_reserve_next(cpu_buffer, length, ts,
+ delta, add_timestamp);
if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
if (!event)
goto out_fail;
- if (!rb_event_is_commit(cpu_buffer, event))
- delta = 0;
-
- event->time_delta = delta;
-
return event;
out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#define TRACE_RECURSIVE_DEPTH 16
-static int trace_recursive_lock(void)
+/* Keep this code out of the fast path cache */
+static noinline void trace_recursive_fail(void)
{
- current->trace_recursion++;
-
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
- return 0;
-
/* Disable all tracing before we do anything else */
tracing_off_permanent();
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
in_nmi());
WARN_ON_ONCE(1);
+}
+
+static inline int trace_recursive_lock(void)
+{
+ current->trace_recursion++;
+
+ if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ return 0;
+
+ trace_recursive_fail();
+
return -1;
}
-static void trace_recursive_unlock(void)
+static inline void trace_recursive_unlock(void)
{
WARN_ON_ONCE(!current->trace_recursion);
@@ -2308,12 +2298,28 @@ static void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
+ u64 delta;
+
/*
* The event first in the commit queue updates the
* time stamp.
*/
- if (rb_event_is_commit(cpu_buffer, event))
- cpu_buffer->write_stamp += event->time_delta;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
}
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static inline void rb_event_discard(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
/* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
again:
/*
- * We repeat when a timestamp is encountered. It is possible
- * to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written, or from discarded
- * commits. The most that we can have is the number on a single page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a timestamp is encountered.
- * We can get multiple timestamps by nested interrupts or also
- * if filtering is on (discarding commits). Since discarding
- * commits can be frequent we can get a lot of timestamps.
- * But we limit them by not adding timestamps if they begin
- * at the start of a page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (len > (commit - read))
len = (commit - read);
- size = rb_event_length(event);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
if (len < size)
goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
break;
event = rb_reader_event(cpu_buffer);
- size = rb_event_length(event);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
} while (len > size);
/* update bpage */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu();
struct dentry *d_cpu;
- /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
- char cpu_dir[7];
+ char cpu_dir[30]; /* 30 characters should be more than enough */
- if (cpu > 999 || cpu < 0)
- return;
-
- sprintf(cpu_dir, "cpu%ld", cpu);
+ snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b8d2852baa4a..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
#include <linux/perf_event.h>
#include <linux/stringify.h>
#include <linux/limits.h>
-#include <linux/uaccess.h>
#include <asm/bitsperlong.h>
#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
stats->ac_ppid = pid_alive(tsk) ?
rcu_dereference(tsk->real_parent)->tgid : 0;
rcu_read_unlock();
- stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
- stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
- stats->ac_utimescaled =
- cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
- stats->ac_stimescaled =
- cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
+ stats->ac_utime = cputime_to_usecs(tsk->utime);
+ stats->ac_stime = cputime_to_usecs(tsk->stime);
+ stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;