Diffstat (limited to 'kernel')
79 files changed, 2776 insertions, 1574 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 6670fbd3e466..d15c0ee4d955 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -147,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = to_acct(ACCESS_ONCE(ns->bacct));
+	res = to_acct(READ_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
@@ -159,7 +159,7 @@ again:
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+	if (res != to_acct(READ_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
 		acct_put(res);
 		goto again;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e2636737b69b..c4b9ab01bba5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
 	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
 		goto err_out;
 
 	ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 #include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
-	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
-	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
-		goto done;
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-			    non_isolated_cpus);
+			    housekeeping_cpumask(HK_FLAG_DOMAIN));
 
 		goto done;
 	}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
-		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+		      cpumask_intersects(cp->cpus_allowed,
+					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, non_isolated_cpus);
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -802,7 +798,6 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
-	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10cdb9c26b5d..4c39c05e029a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 	int ret = 0;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	perf_ctx_lock(cpuctx, task_ctx);
 	/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *data)
 	struct task_struct *task = READ_ONCE(ctx->task);
 	struct perf_event_context *task_ctx = NULL;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	if (task) {
 		if (task == TASK_TOMBSTONE)
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
 	return event->clock();
 }
 
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+
+	if (leader->state <= PERF_EVENT_STATE_OFF)
+		return leader->state;
+
+	return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+	enum perf_event_state state = __perf_effective_state(event);
+	u64 delta = now - event->tstamp;
+
+	*enabled = event->total_time_enabled;
+	if (state >= PERF_EVENT_STATE_INACTIVE)
+		*enabled += delta;
+
+	*running = event->total_time_running;
+	if (state >= PERF_EVENT_STATE_ACTIVE)
+		*running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+	u64 now = perf_event_time(event);
+
+	__perf_update_times(event, now, &event->total_time_enabled,
+					&event->total_time_running);
+	event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+	struct perf_event *sibling;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+		perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+	if (event->state == state)
+		return;
+
+	perf_event_update_time(event);
+	/*
+	 * If a group leader gets enabled/disabled all its siblings
+	 * are affected too.
+	 */
+	if ((event->state < 0) ^ (state < 0))
+		perf_event_update_sibling_time(event);
+
+	WRITE_ONCE(event->state, state);
+}
+
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 	event->shadow_ctx_time = now - t->timestamp;
 }
 
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-	/*
-	 * when the current task's perf cgroup does not match
-	 * the event's, we need to remember to call the
-	 * perf_mark_enable() function the first time a task with
-	 * a matching perf cgroup is scheduled in.
-	 */
-	if (is_cgroup_event(event) && !perf_cgroup_match(event))
-		event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);
-
-	if (!event->cgrp_defer_enabled)
-		return;
-
-	event->cgrp_defer_enabled = 0;
-
-	event->tstamp_enabled = tstamp - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-			sub->cgrp_defer_enabled = 0;
-		}
-	}
-}
-
 /*
  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
  * cleared when last cgroup event is removed.
@@ -975,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
 }
 
 static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-}
-
-static inline void
 list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
@@ -1006,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 	struct perf_cpu_context *cpuctx;
 	int rotations = 0;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 	rotations = perf_rotate_context(cpuctx);
@@ -1093,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
 {
 	struct list_head *head = this_cpu_ptr(&active_ctx_list);
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	WARN_ON(!list_empty(&ctx->active_ctx_list));
 
@@ -1102,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
 
 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 {
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	WARN_ON(list_empty(&ctx->active_ctx_list));
 
@@ -1202,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 
 again:
 	rcu_read_lock();
-	ctx = ACCESS_ONCE(event->ctx);
+	ctx = READ_ONCE(event->ctx);
 	if (!atomic_inc_not_zero(&ctx->refcount)) {
 		rcu_read_unlock();
 		goto again;
@@ -1398,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event)
 	return ctx ? ctx->time : 0;
 }
 
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
-
-	lockdep_assert_held(&ctx->lock);
-
-	if (event->state < PERF_EVENT_STATE_INACTIVE ||
-	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-		return;
-
-	/*
-	 * in cgroup mode, time_enabled represents
-	 * the time the event was enabled AND active
-	 * tasks were in the monitored cgroup. This is
-	 * independent of the activity of the context as
-	 * there may be a mix of cgroup and non-cgroup events.
-	 *
-	 * That is why we treat cgroup events differently
-	 * here.
-	 */
-	if (is_cgroup_event(event))
-		run_end = perf_cgroup_event_time(event);
-	else if (ctx->is_active)
-		run_end = ctx->time;
-	else
-		run_end = event->tstamp_stopped;
-
-	event->total_time_enabled = run_end - event->tstamp_enabled;
-
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		run_end = event->tstamp_stopped;
-	else
-		run_end = perf_event_time(event);
-
-	event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	update_event_times(leader);
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		update_event_times(event);
-}
-
 static enum event_type_t get_event_type(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -1494,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 	event->attach_state |= PERF_ATTACH_CONTEXT;
 
+	event->tstamp = perf_event_time(event);
+
 	/*
 	 * If we're a stand alone event or group leader, we go to the context
 	 * list, group events are kept attached to the group so that
@@ -1701,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader == event)
 		list_del_init(&event->group_entry);
 
-	update_group_times(event);
-
 	/*
 	 * If event was in error state, then keep it
 	 * that way, otherwise bogus counts will be
@@ -1711,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * of the event
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
-		event->state = PERF_EVENT_STATE_OFF;
+		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 
 	ctx->generation++;
 }
@@ -1810,38 +1793,24 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-	u64 delta;
+	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
 
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
-	/*
-	 * An event which could not be activated because of
-	 * filter mismatch still needs to have its timings
-	 * maintained, otherwise bogus information is return
-	 * via read() for time_enabled, time_running:
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE &&
-	    !event_filter_match(event)) {
-		delta = tstamp - event->tstamp_stopped;
-		event->tstamp_running += delta;
-		event->tstamp_stopped = tstamp;
-	}
-
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
 	perf_pmu_disable(event->pmu);
 
-	event->tstamp_stopped = tstamp;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
-	event->state = PERF_EVENT_STATE_INACTIVE;
+
 	if (event->pending_disable) {
 		event->pending_disable = 0;
-		event->state = PERF_EVENT_STATE_OFF;
+		state = PERF_EVENT_STATE_OFF;
 	}
+	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
@@ -1861,7 +1830,9 @@ group_sched_out(struct perf_event *group_event,
 		struct perf_event_context *ctx)
 {
 	struct perf_event *event;
-	int state = group_event->state;
+
+	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+		return;
 
 	perf_pmu_disable(ctx->pmu);
 
@@ -1875,7 +1846,7 @@ group_sched_out(struct perf_event *group_event,
 
 	perf_pmu_enable(ctx->pmu);
 
-	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+	if (group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
@@ -1895,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event,
 {
 	unsigned long flags = (unsigned long)info;
 
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
 	event_sched_out(event, cpuctx, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
@@ -1957,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	update_context_time(ctx);
-	update_cgrp_time_from_event(event);
-	update_group_times(event);
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+
 	if (event == event->group_leader)
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
-	event->state = PERF_EVENT_STATE_OFF;
+
+	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 }
 
 /*
@@ -2021,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
 }
 
 static void perf_set_shadow_time(struct perf_event *event,
-				 struct perf_event_context *ctx,
-				 u64 tstamp)
+				 struct perf_event_context *ctx)
 {
 	/*
 	 * use the correct time source for the time snapshot
@@ -2050,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event,
 	 * is cleaner and simpler to understand.
 	 */
 	if (is_cgroup_event(event))
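A note on the ACCESS_ONCE() -> READ_ONCE() conversions in kernel/acct.c and kernel/events/core.c above: both macros force the compiler to emit exactly one, untorn load, but ACCESS_ONCE() only works reliably on scalar types, which is why the tree has been migrating to READ_ONCE()/WRITE_ONCE(). Below is a minimal userspace sketch of the acct_get()-style check/recheck pattern; the simplified READ_ONCE()/WRITE_ONCE() definitions stand in for the kernel macros, and every other name is hypothetical.

#include <stdio.h>

/* simplified stand-ins for the kernel macros (GCC typeof extension) */
#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

struct acct_sketch {
	long count;
};

static struct acct_sketch *shared;	/* pointer published by another thread */

static struct acct_sketch *acct_get_sketch(void)
{
	struct acct_sketch *res;

again:
	res = READ_ONCE(shared);	/* exactly one load, no tearing */
	if (!res)
		return NULL;
	/*
	 * The kernel rechecks the pointer under res->lock and loops if it
	 * changed since the first load; a plain recheck models that here.
	 */
	if (res != READ_ONCE(shared))
		goto again;
	return res;
}

int main(void)
{
	struct acct_sketch a = { .count = 1 };

	WRITE_ONCE(shared, &a);
	printf("count = %ld\n", acct_get_sketch()->count);
	return 0;
}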
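The kernel/cgroup/cpuset.c hunks swap a scratch cpumask, allocated and recomputed as "possible CPUs minus cpu_isolated_map" on every call, for housekeeping_cpumask(HK_FLAG_DOMAIN), a mask the sched/isolation code computes once at boot; that is why the alloc_cpumask_var()/free_cpumask_var() pair disappears from generate_sched_domains(). A self-contained sketch of that shape, with a 64-bit word standing in for struct cpumask and all *_sketch names invented for illustration:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cpumask_t;			/* up to 64 CPUs, for illustration */

static cpumask_t possible_mask = 0xffull;	/* CPUs 0-7 exist */
static cpumask_t housekeeping_mask;		/* set up once at "boot" */

/* analogue of the boot-time housekeeping setup (e.g. from isolcpus=) */
static void housekeeping_init_sketch(cpumask_t isolated)
{
	housekeeping_mask = possible_mask & ~isolated;
}

/* analogue of housekeeping_cpumask(HK_FLAG_DOMAIN): a cheap accessor */
static cpumask_t housekeeping_cpumask_sketch(void)
{
	return housekeeping_mask;
}

int main(void)
{
	housekeeping_init_sketch(0x3);	/* isolate CPUs 0 and 1 */

	/* what generate_sched_domains() now does per sched domain */
	cpumask_t doms0 = possible_mask & housekeeping_cpumask_sketch();

	printf("load-balanced CPUs: %#llx\n",
	       (unsigned long long)doms0);	/* prints 0xfc */
	return 0;
}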
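The recurring WARN_ON(!irqs_disabled()) -> lockdep_assert_irqs_disabled() change in kernel/events/core.c replaces a check that reads the IRQ state on every call with an assertion that should compile to nothing unless a lockdep-style debug configuration is enabled. A rough userspace analogue of that trade-off; the config symbol and helper names here are made up, and the real macro lives in <linux/lockdep.h>:

#include <assert.h>
#include <stdbool.h>

static bool irqs_disabled_sketch;	/* stands in for the per-CPU IRQ state */

#ifdef CONFIG_LOCKDEP_SKETCH
/* debug build: actually check the invariant */
#define lockdep_assert_irqs_disabled_sketch()	assert(irqs_disabled_sketch)
#else
/* production build: the assertion costs nothing */
#define lockdep_assert_irqs_disabled_sketch()	do { } while (0)
#endif

static int event_function_sketch(void)
{
	lockdep_assert_irqs_disabled_sketch();	/* free when unconfigured */
	return 0;
}

int main(void)
{
	irqs_disabled_sketch = true;
	return event_function_sketch();
}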
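The new state-based timekeeping added to kernel/events/core.c collapses the old tstamp_enabled/tstamp_stopped/tstamp_running trio into a single event->tstamp: whenever the state changes (or a read is requested), the elapsed delta is folded into total_time_enabled and/or total_time_running according to the state being left, exactly as __perf_update_times() does above. The userspace sketch below models only that accounting; it ignores the group-leader rule handled by __perf_effective_state() and uses plain integers for time.

#include <stdio.h>

enum perf_event_state {
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	=  0,
	PERF_EVENT_STATE_ACTIVE		=  1,
};

struct event {
	enum perf_event_state state;
	unsigned long long tstamp;		/* time of last state change */
	unsigned long long total_time_enabled;
	unsigned long long total_time_running;
};

static void update_times(struct event *e, unsigned long long now)
{
	unsigned long long delta = now - e->tstamp;

	/* which totals advance depends only on the state being left */
	if (e->state >= PERF_EVENT_STATE_INACTIVE)
		e->total_time_enabled += delta;
	if (e->state >= PERF_EVENT_STATE_ACTIVE)
		e->total_time_running += delta;
	e->tstamp = now;
}

static void set_state(struct event *e, enum perf_event_state state,
		      unsigned long long now)
{
	if (e->state == state)
		return;
	update_times(e, now);	/* close out the old state's delta */
	e->state = state;
}

int main(void)
{
	struct event e = { .state = PERF_EVENT_STATE_OFF };

	set_state(&e, PERF_EVENT_STATE_INACTIVE, 10);	/* enabled */
	set_state(&e, PERF_EVENT_STATE_ACTIVE,   15);	/* scheduled in */
	set_state(&e, PERF_EVENT_STATE_INACTIVE, 40);	/* scheduled out */
	update_times(&e, 50);				/* read() */

	/* enabled covers 10..50 = 40, running covers 15..40 = 25 */
	printf("enabled=%llu running=%llu\n",
	       e.total_time_enabled, e.total_time_running);
	return 0;
}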
