| | | |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 09:44:15 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 09:44:15 -0800 |
| commit | c0e809e244804d428bcd976eaf9369f60508ea8a | |
| tree | 99fa85899a3c11d2ebeb6d090f218fda968a0e6a /arch/x86 | |
| parent | 2180f214f4a5d8e2d8b7138d9a59246ee05753b9 | |
| parent | 0cc4bd8f70d1ea2940295f1050508c663fe9eff9 | |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"Kernel side changes:
- Ftrace is one of the last W^X violators (after this only KLP is
left). These patches move it over to the generic text_poke()
interface and thereby get rid of this oddity. This requires a
surprising amount of surgery, by Peter Zijlstra.
- x86/AMD PMUs: add support for 'Large Increment per Cycle Events' to
count certain types of events that have a special, quirky hw ABI
(by Kim Phillips)
- kprobes fixes by Masami Hiramatsu
Lots of tooling updates as well, the following subcommands were
updated: annotate/report/top, c2c, clang, record, report/top TUI,
sched timehist, tests; plus updates were done to the gtk ui, libperf,
headers and the parser"
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
perf/x86/amd: Add support for Large Increment per Cycle Events
perf/x86/amd: Constrain Large Increment per Cycle events
perf/x86/intel/rapl: Add Comet Lake support
tracing: Initialize ret in syscall_enter_define_fields()
perf header: Use last modification time for timestamp
perf c2c: Fix return type for histogram sorting comparision functions
perf beauty sockaddr: Fix augmented syscall format warning
perf/ui/gtk: Fix gtk2 build
perf ui gtk: Add missing zalloc object
perf tools: Use %define api.pure full instead of %pure-parser
libperf: Setup initial evlist::all_cpus value
perf report: Fix no libunwind compiled warning break s390 issue
perf tools: Support --prefix/--prefix-strip
perf report: Clarify in help that --children is default
tools build: Fix test-clang.cpp with Clang 8+
perf clang: Fix build with Clang 9
kprobes: Fix optimize_kprobe()/unoptimize_kprobe() cancellation logic
tools lib: Fix builds when glibc contains strlcpy()
perf report/top: Make 'e' visible in the help and make it toggle showing callchains
perf report/top: Do not offer annotation for symbols without samples
...
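The 'Large Increment per Cycle' support called out in the pull message works by pairing counters: such an event may only be scheduled on an even-numbered counter, and the adjacent odd counter is programmed with the 0xFFF Merge event (see amd_core_pmu_init() in the diff further down). Below is a minimal userspace sketch, not kernel code, of how the even-counter mask is built; the six-counter count is an assumption chosen to match Fam17h core PMCs.

```c
/*
 * Userspace sketch of the counter pairing used for Large Increment per
 * Cycle events: a paired event sits on an even counter and the odd
 * counter right after it carries the 0xFFF Merge event.
 * num_counters = 6 is an assumption for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define ARCH_PERFMON_EVENTSEL_ENABLE	(1ULL << 22)
/* AMD event 0xFFF: Merge, as defined in the patch below */
#define AMD_MERGE_EVENT			((0xFULL << 32) | 0xFFULL)
#define AMD_MERGE_EVENT_ENABLE		(AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)

int main(void)
{
	int num_counters = 6;	/* assumed number of core counters */
	uint64_t even_ctr_mask = 0;
	int i;

	/* Same loop as amd_core_pmu_init(): every even index with a successor. */
	for (i = 0; i < num_counters - 1; i += 2)
		even_ctr_mask |= 1ULL << i;

	printf("even_ctr_mask = 0x%llx\n", (unsigned long long)even_ctr_mask);

	/* A paired event on counter i implicitly claims counter i + 1. */
	for (i = 0; i < num_counters - 1; i += 2)
		printf("ctr %d: large-increment event, ctr %d: merge event 0x%llx\n",
		       i, i + 1, (unsigned long long)AMD_MERGE_EVENT_ENABLE);

	return 0;
}
```

With six counters this prints even_ctr_mask = 0x15, i.e. counters 0, 2 and 4 can host a paired event while 1, 3 and 5 are reserved for the Merge event.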
Diffstat (limited to 'arch/x86')
| Mode | Path | Lines changed |
|---|---|---|
| -rw-r--r-- | arch/x86/events/amd/core.c | 109 |
| -rw-r--r-- | arch/x86/events/core.c | 74 |
| -rw-r--r-- | arch/x86/events/intel/rapl.c | 2 |
| -rw-r--r-- | arch/x86/events/perf_event.h | 20 |
| -rw-r--r-- | arch/x86/include/asm/ftrace.h | 2 |
| -rw-r--r-- | arch/x86/include/asm/kprobes.h | 14 |
| -rw-r--r-- | arch/x86/include/asm/set_memory.h | 2 |
| -rw-r--r-- | arch/x86/include/asm/text-patching.h | 86 |
| -rw-r--r-- | arch/x86/kernel/alternative.c | 198 |
| -rw-r--r-- | arch/x86/kernel/ftrace.c | 688 |
| -rw-r--r-- | arch/x86/kernel/jump_label.c | 116 |
| -rw-r--r-- | arch/x86/kernel/kprobes/core.c | 20 |
| -rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 67 |
| -rw-r--r-- | arch/x86/kernel/traps.c | 9 |
| -rw-r--r-- | arch/x86/mm/init_32.c | 28 |
| -rw-r--r-- | arch/x86/mm/init_64.c | 36 |
16 files changed, 590 insertions, 881 deletions
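The W^X cleanup routes ftrace's code patching through the generic text_poke() machinery; part of that is the new text_gen_insn() helper in arch/x86/include/asm/text-patching.h (below), which models an instruction as an opcode byte followed by a signed 32-bit displacement. This standalone sketch shows that encoding for a 5-byte `jmp rel32`; the addresses are made up for illustration.

```c
/*
 * Userspace sketch of the instruction layout used by text_gen_insn():
 * a "jmp rel32" is the 0xE9 opcode followed by a 32-bit displacement
 * measured from the end of the instruction.
 */
#include <stdio.h>
#include <stdint.h>

#define JMP32_INSN_OPCODE	0xE9
#define JMP32_INSN_SIZE		5

union text_poke_insn {
	uint8_t text[JMP32_INSN_SIZE];
	struct {
		uint8_t opcode;
		int32_t disp;
	} __attribute__((packed));
};

int main(void)
{
	/* Hypothetical patch site and jump target. */
	uint64_t addr = 0xffffffff81000000ULL;
	uint64_t dest = 0xffffffff81000040ULL;
	union text_poke_insn insn;

	insn.opcode = JMP32_INSN_OPCODE;
	/* Displacement is relative to the first byte after the instruction. */
	insn.disp = (int32_t)(dest - (addr + JMP32_INSN_SIZE));

	printf("jmp rel32 bytes:");
	for (int i = 0; i < JMP32_INSN_SIZE; i++)
		printf(" %02x", insn.text[i]);
	printf("\n");	/* e9 3b 00 00 00 for the addresses above */

	return 0;
}
```

Computing the displacement from the end of the instruction is why JMP32_INSN_SIZE is added to the source address; the same rule applies in text_gen_insn() for CALL and JMP8 instructions as well.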
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a7752cd78b89..1f22b6bbda68 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -14,6 +14,10 @@ static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); static unsigned long perf_nmi_window; +/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */ +#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) +#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -301,6 +305,25 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel) return offset; } +/* + * AMD64 events are detected based on their event codes. + */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) +{ + if (!(x86_pmu.flags & PMU_FL_PAIR)) + return false; + + switch (amd_get_event_code(hwc)) { + case 0x003: return true; /* Retired SSE/AVX FLOPs */ + default: return false; + } +} + static int amd_core_hw_config(struct perf_event *event) { if (event->attr.exclude_host && event->attr.exclude_guest) @@ -316,15 +339,10 @@ static int amd_core_hw_config(struct perf_event *event) else if (event->attr.exclude_guest) event->hw.config |= AMD64_EVENTSEL_HOSTONLY; - return 0; -} + if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) + event->hw.flags |= PERF_X86_EVENT_PAIR; -/* - * AMD64 events are detected based on their event codes. - */ -static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) -{ - return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); + return 0; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -855,6 +873,29 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, } } +static struct event_constraint pair_constraint; + +static struct event_constraint * +amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (amd_is_pair_event_code(hwc)) + return &pair_constraint; + + return &unconstrained; +} + +static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (is_counter_pair(hwc)) + --cpuc->n_pair; +} + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -898,33 +939,15 @@ static __initconst const struct x86_pmu amd_pmu = { static int __init amd_core_pmu_init(void) { + u64 even_ctr_mask = 0ULL; + int i; + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; - /* Avoid calulating the value each time in the NMI handler */ + /* Avoid calculating the value each time in the NMI handler */ perf_nmi_window = msecs_to_jiffies(100); - switch (boot_cpu_data.x86) { - case 0x15: - pr_cont("Fam15h "); - x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; - break; - case 0x17: - pr_cont("Fam17h "); - /* - * In family 17h, there are no event constraints in the PMC hardware. - * We fallback to using default amd_get_event_constraints. - */ - break; - case 0x18: - pr_cont("Fam18h "); - /* Using default amd_get_event_constraints. 
*/ - break; - default: - pr_err("core perfctr but no constraints; unknown hardware!\n"); - return -ENODEV; - } - /* * If core performance counter extensions exists, we must use * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also @@ -939,6 +962,32 @@ static int __init amd_core_pmu_init(void) */ x86_pmu.amd_nb_constraints = 0; + if (boot_cpu_data.x86 == 0x15) { + pr_cont("Fam15h "); + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + } + if (boot_cpu_data.x86 >= 0x17) { + pr_cont("Fam17h+ "); + /* + * Family 17h and compatibles have constraints for Large + * Increment per Cycle events: they may only be assigned an + * even numbered counter that has a consecutive adjacent odd + * numbered counter following it. + */ + for (i = 0; i < x86_pmu.num_counters - 1; i += 2) + even_ctr_mask |= 1 << i; + + pair_constraint = (struct event_constraint) + __EVENT_CONSTRAINT(0, even_ctr_mask, 0, + x86_pmu.num_counters / 2, 0, + PERF_X86_EVENT_PAIR); + + x86_pmu.get_event_constraints = amd_get_event_constraints_f17h; + x86_pmu.put_event_constraints = amd_put_event_constraints_f17h; + x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE; + x86_pmu.flags |= PMU_FL_PAIR; + } + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f118af9f0718..3bb738f5a472 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -618,6 +618,7 @@ void x86_pmu_disable_all(void) int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -627,6 +628,8 @@ void x86_pmu_disable_all(void) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu_config_addr(idx), val); + if (is_counter_pair(hwc)) + wrmsrl(x86_pmu_config_addr(idx + 1), 0); } } @@ -699,7 +702,7 @@ struct sched_state { int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ @@ -756,8 +759,12 @@ static bool perf_sched_restore_state(struct perf_sched *sched) sched->saved_states--; sched->state = sched->saved[sched->saved_states]; - /* continue with next counter: */ - clear_bit(sched->state.counter++, sched->state.used); + /* this assignment didn't work out */ + /* XXX broken vs EVENT_PAIR */ + sched->state.used &= ~BIT_ULL(sched->state.counter); + + /* try the next one */ + sched->state.counter++; return true; } @@ -782,20 +789,32 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; + u64 mask = BIT_ULL(idx); + + if (sched->state.used & mask) + continue; + + sched->state.used |= mask; + goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) { - if (sched->state.nr_gp++ >= sched->max_gp) - return false; + u64 mask = BIT_ULL(idx); - goto done; - } + if (c->flags & PERF_X86_EVENT_PAIR) + mask |= mask << 1; + + if (sched->state.used & mask) + continue; + + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + + sched->state.used |= mask; + goto done; } return false; @@ -872,12 +891,10 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); + u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), @@ -920,6 +937,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { + u64 mask; + hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; @@ -931,11 +950,16 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (!test_bit(hwc->idx, c->idxmsk)) break; + mask = BIT_ULL(hwc->idx); + if (is_counter_pair(hwc)) + mask |= mask << 1; + /* not already used */ - if (test_bit(hwc->idx, used_mask)) + if (used_mask & mask) break; - __set_bit(hwc->idx, used_mask); + used_mask |= mask; + if (assign) assign[i] = hwc->idx; } @@ -958,6 +982,15 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; + /* + * Reduce the amount of available counters to allow fitting + * the extra Merge events needed by large increment events. 
+ */ + if (x86_pmu.flags & PMU_FL_PAIR) { + gpmax = x86_pmu.num_counters - cpuc->n_pair; + WARN_ON(gpmax <= 0); + } + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } @@ -1038,6 +1071,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, return -EINVAL; cpuc->event_list[n] = leader; n++; + if (is_counter_pair(&leader->hw)) + cpuc->n_pair++; } if (!dogrp) return n; @@ -1052,6 +1087,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, cpuc->event_list[n] = event; n++; + if (is_counter_pair(&event->hw)) + cpuc->n_pair++; } return n; } @@ -1238,6 +1275,13 @@ int x86_perf_event_set_period(struct perf_event *event) wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* + * Clear the Merge event counter's upper 16 bits since + * we currently declare a 48-bit counter width + */ + if (is_counter_pair(hwc)) + wrmsrl(x86_pmu_event_addr(idx + 1), 0); + + /* * Due to erratum on certan cpu we need * a second write to be sure the register * is updated properly diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 5053a403e4ae..09913121e726 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -741,6 +741,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE, model_skl), {}, }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 930611db8f9a..f1cd1ca1a77b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -77,6 +77,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -272,6 +273,7 @@ struct cpu_hw_events { struct amd_nb *amd_nb; /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; + int n_pair; /* Large increment events */ void *kfree_on_online[X86_PERF_KFREE_MAX]; }; @@ -694,6 +696,7 @@ struct x86_pmu { * AMD bits */ unsigned int amd_nb_constraints : 1; + u64 perf_ctr_pair_en; /* * Extra registers for events @@ -743,6 +746,7 @@ do { \ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ +#define PMU_FL_PAIR 0x40 /* merge counters for large incr. 
events */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -838,6 +842,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool is_counter_pair(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_PAIR; +} + static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { @@ -845,6 +854,14 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, if (hwc->extra_reg.reg) wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + + /* + * Add enabled Merge event on next counter + * if large increment event being enabled on this counter + */ + if (is_counter_pair(hwc)) + wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); } @@ -861,6 +878,9 @@ static inline void x86_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; wrmsrl(hwc->config_base, hwc->config); + + if (is_counter_pair(hwc)) + wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0); } void x86_pmu_enable_event(struct perf_event *event); diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c2a7458f912c..85be2f506272 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -47,8 +47,6 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -int ftrace_int3_handler(struct pt_regs *regs); - #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5dc909d9ad81..95b1f053bd96 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -11,12 +11,11 @@ #include <asm-generic/kprobes.h> -#define BREAKPOINT_INSTRUCTION 0xcc - #ifdef CONFIG_KPROBES #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> +#include <asm/text-patching.h> #include <asm/insn.h> #define __ARCH_WANT_KPROBES_INSN_SLOT @@ -25,10 +24,7 @@ struct pt_regs; struct kprobe; typedef u8 kprobe_opcode_t; -#define RELATIVEJUMP_OPCODE 0xe9 -#define RELATIVEJUMP_SIZE 5 -#define RELATIVECALL_OPCODE 0xe8 -#define RELATIVE_ADDR_SIZE 4 + #define MAX_STACK_SIZE 64 #define CUR_STACK_SIZE(ADDR) \ (current_top_of_stack() - (unsigned long)(ADDR)) @@ -43,11 +39,11 @@ extern __visible kprobe_opcode_t optprobe_template_entry[]; extern __visible kprobe_opcode_t optprobe_template_val[]; extern __visible kprobe_opcode_t optprobe_template_call[]; extern __visible kprobe_opcode_t optprobe_template_end[]; -#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)optprobe_template_end - \ (unsigned long)optprobe_template_entry) + \ - MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE) extern const int kretprobe_blacklist_size; @@ -73,7 +69,7 @@ struct arch_specific_insn { struct arch_optimized_insn { /* copy of the original instructions */ - kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + kprobe_opcode_t copied_insn[DISP32_SIZE]; /* detour code buffer */ kprobe_opcode_t *insn; /* the size of instructions copied to detour code buffer */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 2ee8e469dcf5..64c3dce374e5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -81,8 +81,6 @@ int set_direct_map_invalid_noflush(struct 
page *page); int set_direct_map_default_noflush(struct page *page); extern int kernel_set_to_readonly; -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); #ifdef CONFIG_X86_64 static inline int set_mce_nospec(unsigned long pfn) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 23c626a742e8..67315fa3956a 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -25,14 +25,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, */ #define POKE_MAX_OPCODE_SIZE 5 -struct text_poke_loc { - void *addr; - int len; - s32 rel32; - u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; -}; - extern void text_poke_early(void *addr, const void *opcode, size_t len); /* @@ -50,21 +42,13 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); -extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate); -extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; -#ifndef CONFIG_UML_X86 -static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) -{ - regs->ip = ip; -} +extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); +extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -78,6 +62,67 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB +#define DISP32_SIZE 4 + +static inline int text_opcode_size(u8 opcode) +{ + int size = 0; + +#define __CASE(insn) \ + case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break + + switch(opcode) { + __CASE(INT3); + __CASE(CALL); + __CASE(JMP32); + __CASE(JMP8); + } + +#undef __CASE + + return size; +} + +union text_poke_insn { + u8 text[POKE_MAX_OPCODE_SIZE]; + struct { + u8 opcode; + s32 disp; + } __attribute__((packed)); +}; + +static __always_inline +void *text_gen_insn(u8 opcode, const void *addr, const void *dest) +{ + static union text_poke_insn insn; /* per instance */ + int size = text_opcode_size(opcode); + + insn.opcode = opcode; + + if (size > 1) { + insn.disp = (long)dest - (long)(addr + size); + if (size == 2) { + /* + * Ensure that for JMP9 the displacement + * actually fits the signed byte. + */ + BUG_ON((insn.disp >> 31) != (insn.disp >> 7)); + } + } + + return &insn.text; +} + +extern int after_bootmem; +extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init unsigned long poking_addr; + +#ifndef CONFIG_UML_X86 +static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* @@ -85,6 +130,9 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. 
+ * + * Similarly entry_32.S will have a gap on the stack for (any) hardware + * exception and pt_regs; see FIXUP_FRAME. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ec463fe96f2..34360ca301a2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,44 +936,81 @@ static void do_sync_core(void *info) sync_core(); } -static struct bp_patching_desc { +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +struct text_poke_loc { + s32 rel_addr; /* addr := _stext + rel_addr */ + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; +}; + +struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; -} bp_patching; + atomic_t refs; +}; + +static struct bp_patching_desc *bp_desc; + +static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +{ + struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + + if (!desc || !atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static inline void put_desc(struct bp_patching_desc *desc) +{ + smp_mb__before_atomic(); + atomic_dec(&desc->refs); +} -static int patch_cmp(const void *key, const void *elt) +static inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static int notrace patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; - if (key < tp->addr) + if (key < text_poke_addr(tp)) return -1; - if (key > tp->addr) + if (key > text_poke_addr(tp)) return 1; return 0; } NOKPROBE_SYMBOL(patch_cmp); -int poke_int3_handler(struct pt_regs *regs) +int notrace poke_int3_handler(struct pt_regs *regs) { + struct bp_patching_desc *desc; struct text_poke_loc *tp; + int len, ret = 0; void *ip; + if (user_mode(regs)) + return 0; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching.nr_entries. + * bp_desc: * - * nr_entries != 0 INT3 + * bp_desc = desc INT3 * WMB RMB - * write INT3 if (nr_entries) - * - * Idem for other elements in bp_patching. + * write INT3 if (desc) */ smp_rmb(); - if (likely(!bp_patching.nr_entries)) - return 0; - - if (user_mode(regs)) + desc = try_get_desc(&bp_desc); + if (!desc) return 0; /* @@ -984,19 +1021,20 @@ int poke_int3_handler(struct pt_regs *regs) /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(bp_patching.nr_entries > 1)) { - tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + if (unlikely(desc->nr_entries > 1)) { + tp = bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) - return 0; + goto out_put; } else { - tp = bp_patching.vec; - if (tp->addr != ip) - return 0; + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; } - ip += tp->len; + len = text_opcode_size(tp->opcode); + ip += len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1004,7 +1042,7 @@ int poke_int3_handler(struct pt_regs *regs) * Someone poked an explicit INT3, they'll want to handle it, * do not consume. 
*/ - return 0; + goto out_put; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->rel32); @@ -1019,10 +1057,18 @@ int poke_int3_handler(struct pt_regs *regs) BUG(); } - return 1; + ret = 1; + +out_put: + put_desc(desc); + return ret; } NOKPROBE_SYMBOL(poke_int3_handler); +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch @@ -1044,16 +1090,20 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { + struct bp_patching_desc desc = { + .vec = tp, + .nr_entries = nr_entries, + .refs = ATOMIC_INIT(1), + }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - bp_patching.vec = tp; - bp_patching.nr_entries = nr_entries; + smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ /* * Corresponding read barrier in int3 notifier for making sure the @@ -1065,18 +1115,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, &int3, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { - if (tp[i].len - sizeof(int3) > 0) { - text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].text + sizeof(int3), - tp[i].len - sizeof(int3)); + int len = text_opcode_size(tp[i].opcode); + + if (len - INT3_INSN_SIZE > 0) { + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + (const char *)tp[i].text + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); do_sync++; } } @@ -1087,7 +1139,7 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); } /* @@ -1098,19 +1150,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) if (tp[i].text[0] == INT3_INSN_OPCODE) continue; - text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); do_sync++; } if (do_sync) - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* - * sync_core() implies an smp_mb() and orders this store against - * the writing of the new instruction. + * Remove and synchronize_rcu(), except we have a very primitive + * refcount based completion. 
*/ - bp_patching.vec = NULL; - bp_patching.nr_entries = 0; + WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ + if (!atomic_dec_and_test(&desc.refs)) + atomic_cond_read_acquire(&desc.refs, !VAL); } void text_poke_loc_init(struct text_poke_loc *tp, void *addr, @@ -1118,11 +1171,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, { struct insn insn; - if (!opcode) - opcode = (void *)tp->text; - else - memcpy((void *)tp->text, opcode, len); - + memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; @@ -1132,8 +1181,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, BUG_ON(!insn_complete(&insn)); BUG_ON(len != insn.length); - tp->addr = addr; - tp->len = len; + tp->rel_addr = addr - (void *)_stext; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { @@ -1167,6 +1215,55 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } } +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) |
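In alternative.c the old bp_patching globals are replaced by a reference-counted bp_patching_desc so that poke_int3_handler() can never observe a descriptor that is being torn down; the comment in the diff calls it a "very primitive refcount based completion". The following is a simplified userspace sketch of that lifetime scheme using C11 atomics; the names mirror the kernel code, but the memory-ordering details (smp_mb__before_atomic(), atomic_cond_read_acquire()) are deliberately reduced to a plain busy wait.

```c
/*
 * Simplified sketch of the bp_desc lifetime rules from text_poke_bp_batch()
 * and poke_int3_handler(): readers may only use the descriptor if they can
 * bump its refcount from a non-zero value; the writer publishes it, drops
 * its own reference after patching, and waits for the count to hit zero.
 */
#include <stdatomic.h>
#include <stddef.h>

struct bp_patching_desc {
	/* ... vector of text_poke_loc entries would live here ... */
	atomic_int refs;
};

static _Atomic(struct bp_patching_desc *) bp_desc;

/* Reader side (the INT3 handler): take a reference only if still live. */
static struct bp_patching_desc *try_get_desc(void)
{
	struct bp_patching_desc *desc = atomic_load(&bp_desc);
	int old;

	if (!desc)
		return NULL;

	old = atomic_load(&desc->refs);
	do {
		if (!old)
			return NULL;	/* writer already started teardown */
	} while (!atomic_compare_exchange_weak(&desc->refs, &old, old + 1));

	return desc;
}

static void put_desc(struct bp_patching_desc *desc)
{
	atomic_fetch_sub(&desc->refs, 1);
}

/* Writer side (text_poke_bp_batch): publish, patch, then wait for readers. */
static void patch_with(struct bp_patching_desc *desc)
{
	atomic_init(&desc->refs, 1);
	atomic_store(&bp_desc, desc);		/* rcu_assign_pointer analogue */

	/* ... write INT3s, patch instruction tails, restore first bytes ... */

	atomic_store(&bp_desc, NULL);		/* no new readers from here on */
	if (atomic_fetch_sub(&desc->refs, 1) != 1)
		while (atomic_load(&desc->refs))
			;			/* wait for in-flight readers */
}

int main(void)
{
	static struct bp_patching_desc d;
	struct bp_patching_desc *desc;

	patch_with(&d);		/* writer: publish, patch, tear down */

	desc = try_get_desc();	/* a reader after teardown simply sees NULL */
	if (desc)
		put_desc(desc);

	return 0;
}
```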
