diff options
author | Ingo Molnar | 2011-10-12 17:14:47 +0200 |
---|---|---|
committer | Ingo Molnar | 2011-10-12 17:14:47 +0200 |
commit | 910e94dd0cc5abacebf0bd5ffd859f61b9583857 (patch) | |
tree | 7cb33f3f89f5fcdaa4dd4f5f19bb946ff495e9ac | |
parent | 177e2163fe2a5951c1d0531baa4bc1f135c6b7b4 (diff) | |
parent | d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 (diff) |
Merge branch 'tip/perf/core' of git://github.com/rostedt/linux into perf/core
-rw-r--r-- | arch/x86/kernel/jump_label.c | 2 | ||||
-rw-r--r-- | include/linux/module.h | 12 | ||||
-rw-r--r-- | include/linux/ring_buffer.h | 2 | ||||
-rw-r--r-- | include/linux/trace_clock.h | 1 | ||||
-rw-r--r-- | include/linux/tracepoint.h | 25 | ||||
-rw-r--r-- | kernel/module.c | 47 | ||||
-rw-r--r-- | kernel/trace/Makefile | 2 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 8 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 70 | ||||
-rw-r--r-- | kernel/trace/trace.c | 173 | ||||
-rw-r--r-- | kernel/trace/trace.h | 16 | ||||
-rw-r--r-- | kernel/trace/trace_clock.c | 12 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 795 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter_test.h | 50 | ||||
-rw-r--r-- | kernel/trace/trace_irqsoff.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_printk.c | 19 | ||||
-rw-r--r-- | kernel/tracepoint.c | 169 |
17 files changed, 928 insertions, 479 deletions
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 3fee346ef545..cacdd46d184d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,7 +42,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_text_poke_early(jump_label_t addr) +void __init_or_module arch_jump_label_text_poke_early(jump_label_t addr) { text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); diff --git a/include/linux/module.h b/include/linux/module.h index 1c30087a2d81..863921637d9f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -580,9 +580,6 @@ int unregister_module_notifier(struct notifier_block * nb); extern void print_modules(void); -extern void module_update_tracepoints(void); -extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); - #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -698,15 +695,6 @@ static inline int unregister_module_notifier(struct notifier_block * nb) static inline void print_modules(void) { } - -static inline void module_update_tracepoints(void) -{ -} - -static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - return 0; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b891de96000f..67be0376d8e3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -154,6 +154,8 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index 7a8130384087..4eb490237d4c 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -15,5 +15,6 @@ extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); extern u64 notrace trace_clock_global(void); +extern u64 notrace trace_clock_counter(void); #endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d530a4460a0b..df0a779c1bbd 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -54,8 +54,18 @@ extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe, void *data); extern void tracepoint_probe_update_all(void); +#ifdef CONFIG_MODULES +struct tp_module { + struct list_head list; + unsigned int num_tracepoints; + struct tracepoint * const *tracepoints_ptrs; +}; +#endif /* CONFIG_MODULES */ + struct tracepoint_iter { - struct module *module; +#ifdef CONFIG_MODULES + struct tp_module *module; +#endif /* CONFIG_MODULES */ struct tracepoint * const *tracepoint; }; @@ -63,8 +73,6 @@ extern void tracepoint_iter_start(struct tracepoint_iter *iter); extern void tracepoint_iter_next(struct tracepoint_iter *iter); extern void tracepoint_iter_stop(struct tracepoint_iter *iter); extern void tracepoint_iter_reset(struct tracepoint_iter *iter); -extern int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end); /* * tracepoint_synchronize_unregister must be called between the last tracepoint @@ -78,17 +86,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) args -#ifdef CONFIG_TRACEPOINTS -extern -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end); -#else -static inline -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) -{ } -#endif /* CONFIG_TRACEPOINTS */ - #endif /* _LINUX_TRACEPOINT_H */ /* diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..93342d992f34 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3487,50 +3487,3 @@ void module_layout(struct module *mod, } EXPORT_SYMBOL(module_layout); #endif - -#ifdef CONFIG_TRACEPOINTS -void module_update_tracepoints(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - mutex_unlock(&module_mutex); -} - -/* - * Returns 0 if current not found. - * Returns 1 if current found. - */ -int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - struct module *iter_mod; - int found = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(iter_mod, &modules, list) { - if (!iter_mod->taints) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - } - mutex_unlock(&module_mutex); - return found; -} -#endif diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..b384ed512bac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_events_filter.o := -I$(src) + # # Make the trace clocks available generally: it's infrastructure # relied on by ptrace for example: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..077d85387908 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3863,6 +3863,14 @@ void ftrace_kill(void) } /** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + +/** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. * diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..acf6b68dc4a8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + local_t entries_bytes; local_t commit_overrun; local_t overrun; local_t entries; local_t committing; local_t commits; unsigned long read; + unsigned long read_bytes; u64 write_stamp; u64 read_stamp; }; @@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); /* * The entries will be zeroed out when we move the @@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!tail) tail_page->page->time_stamp = ts; + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + return event; } @@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); return 1; + } } /* could not discard */ @@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + ret = bpage->page->time_stamp; + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. @@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..f86efe90ca45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -435,6 +435,7 @@ static struct { } trace_clocks[] = { { trace_clock_local, "local" }, { trace_clock_global, "global" }, + { trace_clock_counter, "counter" }, }; int trace_clock_id; @@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m) } } +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v) if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); + test_ftrace_alive(m); } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); @@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += tr->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - char *buf; - size_t written; + unsigned long addr = (unsigned long)ubuf; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + int nr_pages = 1; + ssize_t written; + void *page1; + void *page2; + int offset; + int size; + int len; + int ret; if (tracing_disabled) return -EINVAL; @@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 2, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. + */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - if (copy_from_user(buf, ubuf, cnt)) { - kfree(buf); - return -EFAULT; + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; } - if (buf[cnt-1] != '\n') { - buf[cnt] = '\n'; - buf[cnt+1] = '\0'; + + page1 = kmap_atomic(pages[0]); + if (nr_pages == 2) + page2 = kmap_atomic(pages[1]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = global_trace.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; + } + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, page1 + offset, len); + memcpy(&entry->buf[len], page2, cnt - len); } else - buf[cnt] = '\0'; + memcpy(&entry->buf, page1 + offset, cnt); - written = mark_printk("%s", buf); - kfree(buf); - *fpos += written; + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; - /* don't tell userspace we wrote more - it might confuse them */ - if (written > cnt) - written = cnt; + ring_buffer_unlock_commit(buffer, event); + + written = cnt; + + *fpos += written; + out_unlock: + if (nr_pages == 2) + kunmap_atomic(page2); + kunmap_atomic(page1); + while (nr_pages > 0) + put_page(pages[--nr_pages]); + out: return written; } @@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_free_buffer_fops = { .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, @@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_array *tr = &global_trace; struct trace_seq *s; unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); + cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + &global_trace, &tracing_total_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, &global_trace, &tracing_free_buffer_fops); @@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) tracing_off(); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + if (disable_tracing) ftrace_kill(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..092e1f8d18dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +extern int ftrace_is_dead(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } +static inline int ftrace_is_dead(void) { return 0; } #endif /* @@ -761,16 +763,10 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - /* - * Leaf nodes use field_name, ops is used by AND and OR - * nodes. The field_name is always freed when freeing a pred. - * We can overload field_name for ops and have it freed - * as well. - */ - union { - char *field_name; - unsigned short *ops; - }; + unsigned short *ops; +#ifdef CONFIG_FTRACE_STARTUP_TEST + struct ftrace_event_field *field; +#endif int offset; int not; int op; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) return now; } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..816d3d074979 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +enum walk_return { + WALK_PRED_ABORT, + WALK_PRED_PARENT, + WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, + struct filter_pred *pred, + int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, + struct filter_pred *root, + filter_pred_walkcb_t cb, void *data) +{ + struct filter_pred *pred = root; + enum move_type move = MOVE_DOWN; + int done = 0; + + if (!preds) + return -EINVAL; + + do { + int err = 0, ret; + + ret = cb(move, pred, &err, data); + if (ret == WALK_PRED_ABORT) + return err; + if (ret == WALK_PRED_PARENT) + goto get_parent; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + goto get_parent; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + get_parent: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, + &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + /* * A series of AND or ORs where found together. Instead of * climbing up and down the tree branches, an array of the @@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds, for (i = 0; i < op->val; i++) { pred = &preds[op->ops[i]]; - match = pred->fn(pred, rec); + if (!WARN_ON_ONCE(!pred->fn)) + match = pred->fn(pred, rec); if (!!match == type) return match; } return match; } +struct filter_match_preds_data { + struct filter_pred *preds; + int match; + void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_match_preds_data *d = data; + + *err = 0; + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) + return WALK_PRED_DEFAULT; + /* We can treat folded ops as a leaf node */ + d->match = process_ops(d->preds, pred, d->rec); + } else { + if (!WARN_ON_ONCE(!pred->fn)) + d->match = pred->fn(pred, d->rec); + } + + return WALK_PRED_PARENT; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!d->match == (pred->op == OP_OR)) + return WALK_PRED_PARENT; + break; + case MOVE_UP_FROM_RIGHT: + break; + } + + return WALK_PRED_DEFAULT; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1; - enum move_type move = MOVE_DOWN; struct filter_pred *preds; - struct filter_pred *pred; struct filter_pred *root; - int n_preds; - int done = 0; + struct filter_match_preds_data data = { + /* match is currently meaningless */ + .match = -1, + .rec = rec, + }; + int n_preds, ret; /* no filter is considered a match */ if (!filter) return 1; n_preds = filter->n_preds; - if (!n_preds) return 1; /* * n_preds, root and filter->preds are protect with preemption disabled. */ - preds = rcu_dereference_sched(filter->preds); root = rcu_dereference_sched(filter->root); if (!root) return 1; - pred = root; - - /* match is currently meaningless */ - match = -1; - - do { - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) { - /* keep going to down the left side */ - pred = &preds[pred->left]; - continue; - } - /* We can treat folded ops as a leaf node */ - match = process_ops(preds, pred, rec); - } else - match = pred->fn(pred, rec); - /* If this pred is the only pred */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!match == (pred->op == OP_OR)) { - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - /* now go down the right side of the tree. */ - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - /* We finished this equation. */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return match; + data.preds = preds = rcu_dereference_sched(filter->preds); + ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); + WARN_ON(ret); + return data.match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name) return __find_event_field(head, name); } -static void filter_free_pred(struct filter_pred *pred) -{ - if (!pred) - return; - - kfree(pred->field_name); - kfree(pred); -} - -static void filter_clear_pred(struct filter_pred *pred) -{ - kfree(pred->field_name); - pred->field_name = NULL; - pred->regex.len = 0; -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); @@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack) static int filter_set_pred(struct event_filter *filter, int idx, struct pred_stack *stack, - struct filter_pred *src, - filter_pred_fn_t fn) + struct filter_pred *src) { struct filter_pred *dest = &filter->preds[idx]; struct filter_pred *left; struct filter_pred *right; *dest = *src; - if (src->field_name) { - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; - } - dest->fn = fn; dest->index = idx; if (dest->op == OP_OR || dest->op == OP_AND) { @@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { - int i; - if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) - kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) } } -static int filter_add_pred_fn(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - filter_pred_fn_t fn) +static int filter_add_pred(struct filter_parse_state *ps, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack) { - int idx, err; + int err; if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } - idx = filter->n_preds; - filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(filter, idx, stack, pred, fn); + err = filter_set_pred(filter, filter->n_preds, stack, pred); if (err) return err; @@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, return fn; } -static int filter_add_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - bool dry_run) +static int init_pred(struct filter_parse_state *ps, + struct ftrace_event_field *field, + struct filter_pred *pred) + { - struct ftrace_event_field *field; - filter_pred_fn_t fn; + filter_pred_fn_t fn = filter_pred_none; unsigned long long val; int ret; - fn = pred->fn = filter_pred_none; - - if (pred->op == OP_AND) - goto add_pred_fn; - else if (pred->op == OP_OR) - goto add_pred_fn; - - field = find_event_field(call, pred->field_name); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return -EINVAL; - } - pred->offset = field->offset; if (!is_legal_op(field, pred->op)) { @@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_NE) pred->not = 1; -add_pred_fn: - if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, stack, fn); + pred->fn = fn; return 0; } @@ -1302,39 +1302,37 @@ parse_operand: return 0; } -static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +static struct filter_pred *create_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + int op, char *operand1, char *operand2) { - struct filter_pred *pred; + struct ftrace_event_field *field; + static struct filter_pred pred; - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - return NULL; + memset(&pred, 0, sizeof(pred)); + pred.op = op; - pred->field_name = kstrdup(operand1, GFP_KERNEL); - if (!pred->field_name) { - kfree(pred); + if (op == OP_AND || op == OP_OR) + return &pred; + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); return NULL; } - strcpy(pred->regex.pattern, operand2); - pred->regex.len = strlen(pred->regex.pattern); - - pred->op = op; - - return pred; -} - -static struct filter_pred *create_logical_pred(int op) -{ - struct filter_pred *pred; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + field = find_event_field(call, operand1); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; + } - pred->op = op; + strcpy(pred.regex.pattern, operand2); + pred.regex.len = strlen(pred.regex.pattern); - return pred; +#ifdef CONFIG_FTRACE_STARTUP_TEST + pred.field = field; +#endif + return init_pred(ps, field, &pred) ? NULL : &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +struct check_pred_data { + int count; + int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct check_pred_data *d = data; + + if (WARN_ON(d->count++ > d->max)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + return WALK_PRED_DEFAULT; +} + /* * The tree is walked at filtering of an event. If the tree is not correctly * built, it may cause an infinite loop. Check here that the tree does @@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps) static int check_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; - int max; - - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - max = 3 * filter->n_preds; + struct check_pred_data data = { + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + .max = 3 * filter->n_preds, + .count = 0, + }; - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; + return walk_pred_tree(filter->preds, root, + check_pred_tree_cb, &data); +} - do { - if (WARN_ON(count++ > max)) - return -EINVAL; +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + int *count = data; - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) + (*count)++; - /* We are fine. */ - return 0; + return WALK_PRED_DEFAULT; } static int count_leafs(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; + int count = 0, ret; - pred = root; + ret = walk_pred_tree(preds, root, count_leafs_cb, &count); + WARN_ON(ret); + return count; +} - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - return 1; - count++; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +struct fold_pred_data { + struct filter_pred *root; + int count; + int children; +}; - return count; +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct fold_pred_data *d = data; + struct filter_pred *root = d->root; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (pred->left != FILTER_PRED_INVALID) + return WALK_PRED_DEFAULT; + + if (WARN_ON(d->count == d->children)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + + pred->index &= ~FILTER_PRED_FOLD; + root->ops[d->count++] = pred->index; + return WALK_PRED_DEFAULT; } static int fold_pred(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; + struct fold_pred_data data = { + .root = root, + .count = 0, + }; int children; - int done = 0; /* No need to keep the fold flag */ root->index &= ~FILTER_PRED_FOLD; @@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) return -ENOMEM; root->val = children; + data.children = children; + return walk_pred_tree(preds, root, fold_pred_cb, &data); +} - pred = root; - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - if (WARN_ON(count == children)) - return -EINVAL; - pred->index &= ~FILTER_PRED_FOLD; - root->ops[count++] = pred->index; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_pred *preds = data; - return 0; + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (!(pred->index & FILTER_PRED_FOLD)) + return WALK_PRED_DEFAULT; + + *err = fold_pred(preds, pred); + if (*err) + return WALK_PRED_ABORT; + + /* eveyrhing below is folded, continue with parent */ + return WALK_PRED_PARENT; } /* @@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) static int fold_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int done = 0; - int err; - - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; - - do { - switch (move) { - case MOVE_DOWN: - if (pred->index & FILTER_PRED_FOLD) { - err = fold_pred(preds, pred); - if (err) - return err; - /* Folded nodes are like leafs */ - } else if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return 0; + return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, + filter->preds); } static int replace_preds(struct ftrace_event_call *call, @@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call, goto fail; } - if (elt->op == OP_AND || elt->op == OP_OR) { - pred = create_logical_pred(elt->op); - goto add_pred; - } - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + pred = create_pred(ps, call, elt->op, operand1, operand2); + if (!pred) { err = -EINVAL; goto fail; } - pred = create_pred(elt->op, operand1, operand2); -add_pred: - if (!pred) { - err = -ENOMEM; - goto fail; + if (!dry_run) { + err = filter_add_pred(ps, filter, pred, &stack); + if (err) + goto fail; } - err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); - filter_free_pred(pred); - if (err) - goto fail; operand1 = operand2 = NULL; } @@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, int err; struct event_filter *filter; struct filter_parse_state *ps; - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (call->event.type == event_id) - break; - } + call = event->tp_event; err = -EINVAL; - if (&call->list == &ftrace_events) + if (!call) goto out_unlock; err = -EEXIST; @@ -2012,3 +1929,215 @@ out_unlock: #endif /* CONFIG_PERF_EVENTS */ +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +static int test_get_filter(char *filter_str, struct ftrace_event_call *call, + struct event_filter **pfilter) +{ + struct event_filter *filter; + struct filter_parse_state *ps; + int err = -ENOMEM; + + filter = __alloc_filter(); + if (!filter) + goto out; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_filter; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + *pfilter = filter; + + free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + free_filter: + if (err) + __free_filter(filter); + + out: + return err; +} + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct ftrace_raw_ftrace_test_filter rec; + int match; + char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + char *fields = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) { + struct ftrace_event_field *field = pred->field; + + if (!field) { + WARN(1, "all leafs should have field defined"); + return WALK_PRED_DEFAULT; + } + if (!strchr(fields, *field->name)) + return WALK_PRED_DEFAULT; + + WARN_ON(!pred->fn); + pred->fn = test_pred_visited_fn; + } + return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) +{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = test_get_filter(d->filter, &event_ftrace_test_filter, + &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", + d->filter, err); + break; + } + + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. + */ + preempt_disable(); + if (*d->not_visited) + walk_pred_tree(filter->preds, filter->root, + test_walk_pred_cb, + d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + preempt_enable(); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e = e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..a1a3359996a7 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } + fmt = NULL; tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); - if (tb_fmt) + if (tb_fmt) { fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt && fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(fmt, *iter); - tb_fmt->fmt = fmt; - *iter = tb_fmt->fmt; - } else { - kfree(tb_fmt); - *iter = NULL; + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); } + *iter = fmt; + } mutex_unlock(&btrace_mutex); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; static const int tracepoint_debug; /* - * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the - * builtin and module tracepoints and the hash table. + * Tracepoints mutex protects the builtin and module tracepoints and the hash + * table, as well as the local module list. */ static DEFINE_MUTEX(tracepoints_mutex); +#ifdef CONFIG_MODULES +/* Local list of struct module */ +static LIST_HEAD(tracepoint_module_list); +#endif /* CONFIG_MODULES */ + /* * Tracepoint hash table, containing the active tracepoints. * Protected by tracepoints_mutex. @@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) * @end: end of the range * * Updates the probe callback corresponding to a range of tracepoints. + * Called with tracepoints_mutex held. */ -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tracepoint_update_probe_range(struct tracepoint * const *begin, + struct tracepoint * const *end) { struct tracepoint * const *iter; struct tracepoint_entry *mark_entry; @@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, if (!begin) return; - mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint((*iter)->name); if (mark_entry) { @@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, disable_tracepoint(*iter); } } - mutex_unlock(&tracepoints_mutex); } +#ifdef CONFIG_MODULES +void module_update_tracepoints(void) +{ + struct tp_module *tp_mod; + + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, + tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); +} +#else /* CONFIG_MODULES */ +void module_update_tracepoints(void) +{ +} +#endif /* CONFIG_MODULES */ + + /* * Update probes, removing the faulty probes. + * Called with tracepoints_mutex held. */ static void tracepoint_update_probes(void) { @@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_remove_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) if (!list_empty(&old_probes)) list_replace_init(&old_probes, &release_probes); need_update = 0; - mutex_unlock(&tracepoints_mutex); - tracepoint_update_probes(); + mutex_unlock(&tracepoints_mutex); list_for_each_entry_safe(pos, next, &release_probes, u.list) { list_del(&pos->u.list); call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); @@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); * Will return the first tracepoint in the range if the input tracepoint is * NULL. */ -int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, +static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, struct tracepoint * const *begin, struct tracepoint * const *end) { if (!*tracepoint && begin != end) { @@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, return 1; return 0; } -EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); +#ifdef CONFIG_MODULES static void tracepoint_get_iter(struct tracepoint_iter *iter) { int found = 0; + struct tp_module *iter_mod; /* Core kernel tracepoints */ if (!iter->module) { @@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) if (found) goto end; } - /* tracepoints in modules. */ - found = module_get_iter_tracepoints(iter); + /* Tracepoints in modules */ + mutex_lock(&tracepoints_mutex); + list_for_each_entry(iter_mod, &tracepoint_module_list, list) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + mutex_unlock(&tracepoints_mutex); end: if (!found) tracepoint_iter_reset(iter); } +#else /* CONFIG_MODULES */ +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints_ptrs, + __stop___tracepoints_ptrs); + if (!found) + tracepoint_iter_reset(iter); +} +#endif /* CONFIG_MODULES */ void tracepoint_iter_start(struct tracepoint_iter *iter) { @@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); void tracepoint_iter_reset(struct tracepoint_iter *iter) { +#ifdef CONFIG_MODULES iter->module = NULL; +#endif /* CONFIG_MODULES */ iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +static int tracepoint_module_coming(struct module *mod) +{ + struct tp_module *tp_mod, *iter; + int ret = 0; + + /* + * We skip modules that tain the kernel, especially those with different + * module header (for forced load), to make sure we don't cause a crash. + */ + if (mod->taints) + return 0; + mutex_lock(&tracepoints_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); + if (!tp_mod) { + ret = -ENOMEM; + goto end; + } + tp_mod->num_tracepoints = mod->num_tracepoints; + tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + + /* + * tracepoint_module_list is kept sorted by struct module pointer + * address for iteration on tracepoints from a seq_file that can release + * the mutex between calls. + */ + list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { + BUG_ON(iter == tp_mod); /* Should never be in the list twice */ + if (iter < tp_mod) { + /* We belong to the location right after iter. */ + list_add(&tp_mod->list, &iter->list); + goto module_added; + } + } + /* We belong to the beginning of the list */ + list_add(&tp_mod->list, &tracepoint_module_list); +module_added: + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} + +static int tracepoint_module_going(struct module *mod) +{ + struct tp_module *pos; + + mutex_lock(&tracepoints_mutex); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); + list_for_each_entry(pos, &tracepoint_module_list, list) { + if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { + list_del(&pos->list); + kfree(pos); + break; + } + } + /* + * In the case of modules that were tainted at "coming", we'll simply + * walk through the list without finding it. We cannot use the "tainted" + * flag on "going", in case a module taints the kernel only after being + * loaded. + */ + mutex_unlock(&tracepoints_mutex); + return 0; +} int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; + int ret = 0; switch (val) { case MODULE_STATE_COMING: + ret = tracepoint_module_coming(mod); + break; + case MODULE_STATE_LIVE: + break; case MODULE_STATE_GOING: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + ret = tracepoint_module_going(mod); break; } - return 0; + return ret; } struct notifier_block tracepoint_module_nb = { @@ -598,7 +724,6 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); - #endif /* CONFIG_MODULES */ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS |