From 09cbfeaf1a5a67bfb3201e0c83c810cecb2efa5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Apr 2016 15:29:47 +0300 Subject: mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ; - >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 220fc17b9718..7edc95edfaee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -321,7 +321,7 @@ retry: copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - page_cache_release(new_page); + put_page(new_page); put_old: put_page(old_page); @@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, * see uprobe_register(). 
*/ if (mapping->a_ops->readpage) - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else - page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); + page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); - page_cache_release(page); + put_page(page); return 0; } -- cgit v1.2.3 From d82bccc69041a51f7b7b9b4a36db0772f4cdba21 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 12 Apr 2016 10:26:19 -0700 Subject: bpf/verifier: reject invalid LD_ABS | BPF_DW instruction verifier must check for reserved size bits in instruction opcode and reject BPF_LD | BPF_ABS | BPF_DW and BPF_LD | BPF_IND | BPF_DW instructions, otherwise interpreter will WARN_RATELIMIT on them during execution. Fixes: ddd872bc3098 ("bpf: verifier: add checks for BPF_ABS | BPF_IND instructions") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e08f8e9b771..618ef77c302a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1374,6 +1374,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { verbose("BPF_LD_ABS uses reserved fields\n"); return -EINVAL; -- cgit v1.2.3 From 51d7b120418e99d6b3bf8df9eb3cc31e8171dee4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Apr 2016 12:05:37 -0700 Subject: /proc/iomem: only expose physical resource addresses to privileged users In commit c4004b02f8e5b ("x86: remove the kernel code/data/bss resources from /proc/iomem") I was hoping to remove the phyiscal kernel address data from /proc/iomem entirely, but that had to be reverted because some system programs actually use it. This limits all the detailed resource information to properly credentialed users instead. Signed-off-by: Linus Torvalds --- kernel/resource.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 2e78ead30934..9b5f04404152 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; + unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; + + if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { + start = r->start; + end = r->end; + } else { + start = end = 0; + } + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, + width, start, + width, end, r->name ? r->name : ""); return 0; } -- cgit v1.2.3 From 6687659568e2ec5b3ac24b39c5d26ce8b9d90434 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 17 Apr 2016 23:31:41 -0700 Subject: locking/pvqspinlock: Fix division by zero in qstat_read() While playing with the qstat statistics (in /qlockstat/) I ran into the following splat on a VM when opening pv_hash_hops: divide error: 0000 [#1] SMP ... RIP: 0010:[] [] qstat_read+0x12e/0x1e0 ... Call Trace: [] ? 
mem_cgroup_commit_charge+0x6c/0xd0 [] ? page_add_new_anon_rmap+0x8c/0xd0 [] ? handle_mm_fault+0x1439/0x1b40 [] ? do_mmap+0x449/0x550 [] ? __vfs_read+0x23/0xd0 [] ? rw_verify_area+0x52/0xd0 [] ? vfs_read+0x81/0x120 [] ? SyS_read+0x42/0xa0 [] ? entry_SYSCALL_64_fastpath+0x1e/0xa8 Fix this by verifying that qstat_pv_kick_unlock is in fact non-zero, similarly to what the qstat_pv_latency_wake case does, as if nothing else, this can come from resetting the statistics, thus having 0 kicks should be quite valid in this context. Signed-off-by: Davidlohr Bueso Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: waiman.long@hpe.com Link: http://lkml.kernel.org/r/1460961103-24953-1-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_stat.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb2a2c9bc3fc..d734b7502001 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -136,10 +136,12 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, } if (counter == qstat_pv_hash_hops) { - u64 frac; + u64 frac = 0; - frac = 100ULL * do_div(stat, kicks); - frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + if (kicks) { + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + } /* * Return a X.XX decimal number -- cgit v1.2.3 From 89e9e66ba1b3bde9d8ea90566c2aee20697ad681 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Apr 2016 14:35:39 +0200 Subject: futex: Handle unlock_pi race gracefully If userspace calls UNLOCK_PI unconditionally without trying the TID -> 0 transition in user space first then the user space value might not have the waiters bit set. This opens the following race: CPU0 CPU1 uval = get_user(futex) lock(hb) lock(hb) futex |= FUTEX_WAITERS .... unlock(hb) cmpxchg(futex, uval, newval) So the cmpxchg fails and returns -EINVAL to user space, which is wrong because the futex value is valid. To handle this (yes, yet another) corner case gracefully, check for a flag change and retry. [ tglx: Massaged changelog and slightly reworked implementation ] Fixes: ccf9e6a80d9e ("futex: Make unlock_pi more robust") Signed-off-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Cc: Davidlohr Bueso Cc: Darren Hart Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460723739-5195-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a5d2e74c89e0..fd204e1670c9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1295,10 +1295,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. 
+ */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } if (ret) { raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; @@ -2622,6 +2632,15 @@ retry: */ if (ret == -EFAULT) goto pi_faulted; + /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) { + spin_unlock(&hb->lock); + put_futex_key(&key); + goto retry; + } /* * wake_futex_pi has detected invalid state. Tell user * space. -- cgit v1.2.3 From fe1bce9e2107ba3a8faffe572483b6974201a0e6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Apr 2016 20:09:24 -0700 Subject: futex: Acknowledge a new waiter in counter before plist Otherwise an incoming waker on the dest hash bucket can miss the waiter adding itself to the plist during the lockless check optimization (small window but still the correct way of doing this); similarly to the decrement counterpart. Suggested-by: Peter Zijlstra Signed-off-by: Davidlohr Bueso Cc: Davidlohr Bueso Cc: bigeasy@linutronix.de Cc: dvhart@infradead.org Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1461208164-29150-1-git-send-email-dave@stgolabs.net Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fd204e1670c9..c20f06f38ef3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1535,8 +1535,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); hb_waiters_dec(hb1); - plist_add(&q->list, &hb2->chain); hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); -- cgit v1.2.3 From 4589f450fb285ab85f7513b6649e51ec2a820653 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 21 Apr 2016 10:08:32 +0100 Subject: genirq: Dont allow affinity mask to be updated on IPIs The IPI domain re-purposes the IRQ affinity to signify the mask of CPUs that this IPI will deliver to. This must not be modified before the IPI is destroyed again, so set the IRQ_NO_BALANCING flag to prevent the affinity being overwritten by setup_affinity(). Without this, if an IPI is reserved for a single target CPU, then allocated using __setup_irq(), the affinity is overwritten with cpu_online_mask. When ipi_destroy() is subsequently called on a multi-cpu system, it will attempt to free cpumask_weight() IRQs that were never allocated, and crash. 
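[Editorial illustration, not part of the patch; every name below is invented.] The mismatch can be modelled in plain userspace C: the teardown path sizes its loop from whatever mask it finds, so a mask that was widened after reservation makes it release more entries than were ever allocated.

	/*
	 * Illustrative userspace model (not kernel code) of why rewriting the
	 * IPI "affinity" mask between reserve and destroy is dangerous: the
	 * teardown path sizes its loop from the mask, so a widened mask makes
	 * it free descriptors that were never allocated.
	 */
	#include <stdio.h>

	static unsigned int weight(unsigned long mask)
	{
		unsigned int w = 0;

		while (mask) {
			w += mask & 1;
			mask >>= 1;
		}
		return w;
	}

	int main(void)
	{
		unsigned long reserved_mask = 0x1;	/* IPI reserved for CPU0 only */
		unsigned int allocated = weight(reserved_mask);

		/* a later "balancing" pass blindly rewrites the mask ... */
		unsigned long online_mask = 0xf;	/* pretend four CPUs are online */
		reserved_mask = online_mask;

		/* ... so teardown now sees more descriptors than were allocated */
		unsigned int to_free = weight(reserved_mask);

		printf("allocated %u, about to free %u -> %s\n",
		       allocated, to_free,
		       to_free > allocated ? "freeing unallocated IRQs (crash)" : "ok");
		return 0;
	}

Run as-is it reports one entry allocated and four about to be freed, which is the shape of the crash described above; the IRQ_NO_BALANCING flag simply prevents the mask from being rewritten in the first place.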
Fixes: d17bf24e6952 ("genirq: Add a new generic IPI reservation code to irq core") Signed-off-by: Matt Redfearn Cc: linux-mips@linux-mips.org Cc: jason@lakedaemon.net Cc: marc.zyngier@arm.com Cc: ralf@linux-mips.org Cc: Qais Yousef Cc: lisa.parratt@imgtec.com Link: http://lkml.kernel.org/r/1461229712-13057-1-git-send-email-matt.redfearn@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/ipi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index c37f34b00a11..14777af8e097 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -94,6 +94,7 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain, data = irq_get_irq_data(virq + i); cpumask_copy(data->common->affinity, dest); data->common->ipi_offset = offset; + irq_set_status_flags(virq + i, IRQ_NO_BALANCING); } return virq; -- cgit v1.2.3 From 3b9d6da67e11ca8f78fde887918983523a36b0fa Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 8 Apr 2016 14:40:15 +0200 Subject: cpu/hotplug: Fix rollback during error-out in __cpu_disable() The recent introduction of the hotplug thread which invokes the callbacks on the plugged cpu, cased the following regression: If takedown_cpu() fails, then we run into several issues: 1) The rollback of the target cpu states is not invoked. That leaves the smp threads and the hotplug thread in disabled state. 2) notify_online() is executed due to a missing skip_onerr flag. That causes that both CPU_DOWN_FAILED and CPU_ONLINE notifications are invoked which confuses quite some notifiers. 3) The CPU_DOWN_FAILED notification is not invoked on the target CPU. That's not an issue per se, but it is inconsistent and in consequence blocks the patches which rely on these states being invoked on the target CPU and not on the controlling cpu. It also does not preserve the strict call order on rollback which is problematic for the ongoing state machine conversion as well. To fix this we add a rollback flag to the remote callback machinery and invoke the rollback including the CPU_DOWN_FAILED notification on the remote cpu. Further mark the notify online state with 'skip_onerr' so we don't get a double invokation. This workaround will go away once we moved the unplug invocation to the target cpu itself. 
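[Editorial illustration, not part of the patch.] A minimal userspace sketch of the rollback-flag idea, an assumption-laden model rather than the kernel state machine: a failed teardown restores the previous target and hands control to a per-CPU worker that walks the states back up.

	/*
	 * Sketch only: failed teardown sets a rollback flag and the worker
	 * undoes the partial transition instead of leaving it half-way down.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	enum state { OFFLINE, PREPARED, ONLINE_IDLE, ONLINE };

	struct cpu_sm {
		enum state state;
		enum state target;
		bool rollback;
	};

	static int teardown_step(enum state s)
	{
		/* pretend the step leaving ONLINE_IDLE refuses to die */
		return s == ONLINE_IDLE ? -1 : 0;
	}

	static void worker(struct cpu_sm *sm)
	{
		if (sm->rollback) {
			while (sm->state < sm->target)
				sm->state++;	/* undo: bring the states back up */
			sm->rollback = false;
		}
	}

	int main(void)
	{
		struct cpu_sm sm = { .state = ONLINE, .target = OFFLINE };
		enum state prev = sm.state;

		while (sm.state > sm.target) {
			if (teardown_step(sm.state) < 0) {
				sm.target = prev;	/* restore the old target ... */
				sm.rollback = true;	/* ... and ask the worker to roll back */
				break;
			}
			sm.state--;
		}
		worker(&sm);
		printf("final state %d (expected %d)\n", sm.state, prev);
		return 0;
	}
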
[ tglx: Massaged changelog and moved the CPU_DOWN_FAILED notifiaction to the target cpu ] Fixes: 4cb28ced23c4 ("cpu/hotplug: Create hotplug threads") Reported-by: Heiko Carstens Signed-off-by: Sebastian Andrzej Siewior Cc: linux-s390@vger.kernel.org Cc: rt@linutronix.de Cc: Martin Schwidefsky Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20160408124015.GA21960@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ea42e8da861..3e3f6e49eabb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -36,6 +36,7 @@ * @target: The target state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute + * @rollback: Perform a rollback * @cb_stat: The state for a single callback (install/uninstall) * @cb: Single callback function (install/uninstall) * @result: Result of the operation @@ -47,6 +48,7 @@ struct cpuhp_cpu_state { #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; + bool rollback; enum cpuhp_state cb_state; int (*cb)(unsigned int cpu); int result; @@ -301,6 +303,11 @@ static int cpu_notify(unsigned long val, unsigned int cpu) return __cpu_notify(val, cpu, -1, NULL); } +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + /* Notifier wrappers for transitioning to state machine */ static int notify_prepare(unsigned int cpu) { @@ -477,6 +484,16 @@ static void cpuhp_thread_fun(unsigned int cpu) } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); } + } else if (st->rollback) { + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + undo_cpu_down(cpu, st, cpuhp_ap_states); + /* + * This is a momentary workaround to keep the notifier users + * happy. Will go away once we got rid of the notifiers. + */ + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); + st->rollback = false; } else { /* Cannot happen .... */ BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -636,11 +653,6 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} - static int notify_down_prepare(unsigned int cpu) { int err, nr_calls = 0; @@ -721,9 +733,10 @@ static int takedown_cpu(unsigned int cpu) */ err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { - /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED, cpu); + /* CPU refused to die */ irq_unlock_sparse(); + /* Unpark the hotplug thread so we can rollback there */ + kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread); return err; } BUG_ON(cpu_online(cpu)); @@ -832,6 +845,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. 
*/ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + st->target = prev_state; + st->rollback = true; + cpuhp_kick_ap_work(cpu); + } hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; out: @@ -1249,6 +1267,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, + .skip_onerr = true, }, #endif /* -- cgit v1.2.3 From b303e7c15d53cd8ef6b349b702e07eee3f102792 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Apr 2016 09:57:40 +0200 Subject: perf/core: Make sysctl_perf_cpu_time_max_percent conform to documentation Markus reported that 0 should also disable the throttling we per Documentation/sysctl/kernel.txt. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 91a612eea9a3 ("perf/core: Fix dynamic interrupt throttle") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 52bedc5a5aaa..2c78b6f47339 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - if (sysctl_perf_cpu_time_max_percent == 100) { + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); -- cgit v1.2.3 From c24697566298df04cac9913e0601501b5ee2b3f5 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 16 Feb 2016 13:57:40 +0800 Subject: locking/lockdep: Fix ->irq_context calculation task_irq_context() returns the encoded irq_context of the task, the return value is encoded in the same as ->irq_context of held_lock. Always return 0 if !(CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING) Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1455602265-16490-2-git-send-email-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ed9410936a22..beb06f604420 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2932,6 +2932,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 2 * !!task->hardirq_context + !!task->softirq_context; +} + static int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -2940,8 +2945,6 @@ static int separate_irq_context(struct task_struct *curr, /* * Keep track of points where we cross into an interrupt context: */ - hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + - curr->softirq_context; if (depth) { struct held_lock *prev_hlock; @@ -2973,6 +2976,11 @@ static inline int mark_irqflags(struct task_struct *curr, return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + static inline int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -3241,6 +3249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; + hlock->irq_context = task_irq_context(curr); hlock->trylock = trylock; hlock->read = read; hlock->check = check; -- cgit v1.2.3 From 75dd602a5198a6e5f75534db52b6e6fbaabb33d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Mar 2016 11:36:59 +0200 Subject: lockdep: Fix lock_chain::base size lock_chain::base is used to store an index into the chain_hlocks[] array, however that array contains more elements than can be indexed using the u16. Change the lock_chain structure to use a bitfield to encode the data it needs and add BUILD_BUG_ON() assertions to check the fields are wide enough. Also, for DEBUG_LOCKDEP, assert that we don't run out of elements of that array; as that would wreck the collision detectoring. Signed-off-by: Peter Zijlstra (Intel) Cc: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sedat Dilek Cc: Theodore Ts'o Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160330093659.GS3408@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 24 +++++++++++++++++++++++- kernel/locking/lockdep_proc.c | 2 ++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index beb06f604420..78c1c0ee6dc1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2176,15 +2176,37 @@ cache_hit: chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; + + BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks)); + BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); + BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; } + + if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) + nr_chain_hlocks += chain->depth; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * Important for check_no_collision(). 
+ */ + if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + if (debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; + } +#endif + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dbb61a302548..a0f61effad25 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) { + if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } -- cgit v1.2.3 From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:06:48 -0400 Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset kicks off asynchronous NUMA node migration if necessary during task migration and flushes it from cpuset_post_attach_flush() which is called at the end of __cgroup_procs_write(). This is to avoid performing migration with cgroup_threadgroup_rwsem write-locked which can lead to deadlock through dependency on kworker creation. memcg has a similar issue with charge moving, so let's convert it to an official callback rather than the current one-off cpuset specific function. This patch adds cgroup_subsys->post_attach callback and makes cpuset register cpuset_post_attach_flush() as its ->post_attach. The conversion is mostly one-to-one except that the new callback is called under cgroup_mutex. This is to guarantee that no other migration operations are started before ->post_attach callbacks are finished. cgroup_mutex is one of the outermost mutex in the system and has never been and shouldn't be a problem. We can add specialized synchronization around __cgroup_procs_write() but I don't think there's any noticeable benefit. 
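[Editorial illustration, not part of the patch; names are invented, this is not the cgroup core.] A small, self-contained C model of the resulting callback pattern: the attach tail no longer names cpuset directly, it just walks the registered subsystems and calls an optional hook.

	/*
	 * Sketch of the ->post_attach callback pattern: replace a hard-coded
	 * cpuset flush at the end of the attach path with a generic hook that
	 * every subsystem may optionally provide.
	 */
	#include <stdio.h>

	struct subsys {
		const char *name;
		void (*post_attach)(void);	/* optional; may be NULL */
	};

	static void cpuset_post_attach(void)
	{
		puts("cpuset: flushing deferred NUMA migration work");
	}

	static struct subsys subsystems[] = {
		{ .name = "cpuset", .post_attach = cpuset_post_attach },
		{ .name = "memory", .post_attach = NULL },
	};

	static void procs_write_tail(void)
	{
		/* previously: cpuset_post_attach_flush(); now a generic walk */
		for (size_t i = 0; i < sizeof(subsystems) / sizeof(subsystems[0]); i++)
			if (subsystems[i].post_attach)
				subsystems[i].post_attach();
	}

	int main(void)
	{
		procs_write_tail();
		return 0;
	}
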
Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: # 4.4+ prerequisite for the next patch --- kernel/cgroup.c | 7 +++++-- kernel/cpuset.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671dc05c0b0f..909a7d31ffd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2875,8 +2876,10 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00ab5c2b7c5b..1902956baba1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = true, -- cgit v1.2.3 From 346c09f80459a3ad97df1816d6d606169a51001a Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 26 Apr 2016 13:15:35 +0200 Subject: workqueue: fix ghost PENDING flag while doing MQ IO The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list with the following backtrace: [ 601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds. [ 601.347574] Tainted: G O 4.4.5-1-storage+ #6 [ 601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 601.348142] kworker/u129:5 D ffff880803077988 0 1636 2 0x00000000 [ 601.348519] Workqueue: ibnbd_server_fileio_wq ibnbd_dev_file_submit_io_worker [ibnbd_server] [ 601.348999] ffff880803077988 ffff88080466b900 ffff8808033f9c80 ffff880803078000 [ 601.349662] ffff880807c95000 7fffffffffffffff ffffffff815b0920 ffff880803077ad0 [ 601.350333] ffff8808030779a0 ffffffff815b01d5 0000000000000000 ffff880803077a38 [ 601.350965] Call Trace: [ 601.351203] [] ? bit_wait+0x60/0x60 [ 601.351444] [] schedule+0x35/0x80 [ 601.351709] [] schedule_timeout+0x192/0x230 [ 601.351958] [] ? blk_flush_plug_list+0xc7/0x220 [ 601.352208] [] ? ktime_get+0x37/0xa0 [ 601.352446] [] ? bit_wait+0x60/0x60 [ 601.352688] [] io_schedule_timeout+0xa4/0x110 [ 601.352951] [] ? _raw_spin_unlock_irqrestore+0xe/0x10 [ 601.353196] [] bit_wait_io+0x1b/0x70 [ 601.353440] [] __wait_on_bit+0x5d/0x90 [ 601.353689] [] wait_on_page_bit+0xc0/0xd0 [ 601.353958] [] ? 
autoremove_wake_function+0x40/0x40 [ 601.354200] [] __filemap_fdatawait_range+0xe4/0x140 [ 601.354441] [] filemap_fdatawait_range+0x14/0x30 [ 601.354688] [] filemap_write_and_wait_range+0x3f/0x70 [ 601.354932] [] blkdev_fsync+0x1b/0x50 [ 601.355193] [] vfs_fsync_range+0x49/0xa0 [ 601.355432] [] blkdev_write_iter+0xca/0x100 [ 601.355679] [] __vfs_write+0xaa/0xe0 [ 601.355925] [] vfs_write+0xa9/0x1a0 [ 601.356164] [] kernel_write+0x38/0x50 The underlying device is a null_blk, with default parameters: queue_mode = MQ submit_queues = 1 Verification that nullb0 has something inflight: root@pserver8:~# cat /sys/block/nullb0/inflight 0 1 root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec cat {} \; ... /sys/block/nullb0/mq/0/cpu2/rq_list CTX pending: ffff8838038e2400 ... During debug it became clear that stalled request is always inserted in the rq_list from the following path: save_stack_trace_tsk + 34 blk_mq_insert_requests + 231 blk_mq_flush_plug_list + 281 blk_flush_plug_list + 199 wait_on_page_bit + 192 __filemap_fdatawait_range + 228 filemap_fdatawait_range + 20 filemap_write_and_wait_range + 63 blkdev_fsync + 27 vfs_fsync_range + 73 blkdev_write_iter + 202 __vfs_write + 170 vfs_write + 169 kernel_write + 56 So blk_flush_plug_list() was called with from_schedule == true. If from_schedule is true, that means that finally blk_mq_insert_requests() offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue, i.e. it calls kblockd_schedule_delayed_work_on(). That means, that we race with another CPU, which is about to execute __blk_mq_run_hw_queue() work. Further debugging shows the following traces from different CPUs: CPU#0 CPU#1 ---------------------------------- ------------------------------- reqeust A inserted STORE hctx->ctx_map[0] bit marked kblockd_schedule...() returns 1 request B inserted STORE hctx->ctx_map[1] bit marked kblockd_schedule...() returns 0 *** WORK PENDING bit is cleared *** flush_busy_ctxs() is executed, but bit 1, set by CPU#1, is not observed As a result request B pended forever. This behaviour can be explained by speculative LOAD of hctx->ctx_map on CPU#0, which is reordered with clear of PENDING bit and executed _before_ actual STORE of bit 1 on CPU#1. The proper fix is an explicit full barrier , which guarantees that clear of PENDING bit is to be executed before all possible speculative LOADS or STORES inside actual work function. Signed-off-by: Roman Pen Cc: Gioh Kim Cc: Michael Wang Cc: Tejun Heo Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16e13d8628a3..801a698564d4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. 
consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) -- cgit v1.2.3 From 8358b02bf67d3a5d8a825070e1aa73f25fb2e4c7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 26 Apr 2016 22:26:26 +0200 Subject: bpf: fix double-fdput in replace_map_fd_with_map_ptr() When bpf(BPF_PROG_LOAD, ...) was invoked with a BPF program whose bytecode references a non-map file descriptor as a map file descriptor, the error handling code called fdput() twice instead of once (in __bpf_map_get() and in replace_map_fd_with_map_ptr()). If the file descriptor table of the current task is shared, this causes f_count to be decremented too much, allowing the struct file to be freed while it is still in use (use-after-free). This can be exploited to gain root privileges by an unprivileged user. This bug was introduced in commit 0246e64d9a5f ("bpf: handle pseudo BPF_LD_IMM64 insn"), but is only exploitable since commit 1be7f75d1668 ("bpf: enable non-root eBPF programs") because previously, CAP_SYS_ADMIN was required to reach the vulnerable code. (posted publicly according to request by maintainer) Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 618ef77c302a..db2574e7b8b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2030,7 +2030,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); - fdput(f); return PTR_ERR(map); } -- cgit v1.2.3 From 79c9ce57eb2d5f1497546a3946b4ae21b6fdc438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Apr 2016 11:36:53 +0200 Subject: perf/core: Fix perf_event_open() vs. execve() race Jann reported that the ptrace_may_access() check in find_lively_task_by_vpid() is racy against exec(). Specifically: perf_event_open() execve() ptrace_may_access() commit_creds() ... if (get_dumpable() != SUID_DUMP_USER) perf_event_exit_task(); perf_install_in_context() would result in installing a counter across the creds boundary. Fix this by wrapping lots of perf_event_open() in cred_guard_mutex. This should be fine as perf_event_exit_task() is already called with cred_guard_mutex held, so all perf locks already nest inside it. 
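[Editorial illustration, not part of the patch; illustrative names and state, not the perf implementation.] The shape of the fix can be sketched in userspace C with pthreads: the permission check and the subsequent install happen under the same mutex that the exec path takes before changing credentials, so the check cannot go stale in between.

	/*
	 * Sketch: check and install are done under the lock that also guards
	 * the credential change, closing the check-then-act window.
	 */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t cred_guard_mutex = PTHREAD_MUTEX_INITIALIZER;
	static bool target_is_dumpable = true;	/* stands in for the creds state */

	static bool may_access(void)
	{
		return target_is_dumpable;
	}

	static void perf_event_open_like(void)
	{
		pthread_mutex_lock(&cred_guard_mutex);
		if (!may_access()) {			/* check ... */
			pthread_mutex_unlock(&cred_guard_mutex);
			puts("open: -EACCES");
			return;
		}
		puts("open: counter installed");	/* ... and act under the same lock */
		pthread_mutex_unlock(&cred_guard_mutex);
	}

	static void *execve_like(void *arg)
	{
		pthread_mutex_lock(&cred_guard_mutex);
		target_is_dumpable = false;		/* the creds change crossing the boundary */
		pthread_mutex_unlock(&cred_guard_mutex);
		return arg;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, execve_like, NULL);
		perf_event_open_like();
		pthread_join(t, NULL);
		return 0;
	}

Build with -lpthread; whichever thread wins the lock, the open path either installs under a still-valid check or fails cleanly, never a mix of the two.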
Reported-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 52 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2c78b6f47339..4e2ebf6f2f1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1106,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: + * cred_guard_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -3421,7 +3422,6 @@ static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; - int err; rcu_read_lock(); if (!vpid) @@ -3435,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto errout; - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - } /* @@ -8414,6 +8405,24 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (task) { + err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (err) + goto err_cpus; + + /* + * Reuse ptrace permission checks for now. + * + * We must hold cred_guard_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -8421,7 +8430,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cpus; + goto err_cred; } if (is_sampling_event(event)) { @@ -8480,11 +8489,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - if (task) { - put_task_struct(task); - task = NULL; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -8582,6 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open, WARN_ON_ONCE(ctx->parent_ctx); + /* + * This is the point on no return; we cannot fail hereafter. This is + * where we start modifying current state. + */ + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details @@ -8653,6 +8662,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&gctx->mutex); mutex_unlock(&ctx->mutex); + if (task) { + mutex_unlock(&task->signal->cred_guard_mutex); + put_task_struct(task); + } + put_online_cpus(); mutex_lock(¤t->perf_event_mutex); @@ -8685,6 +8699,9 @@ err_alloc: */ if (!event_file) free_event(event); +err_cred: + if (task) + mutex_unlock(&task->signal->cred_guard_mutex); err_cpus: put_online_cpus(); err_task: @@ -8969,6 +8986,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. + * + * Can be called with cred_guard_mutex held when called from + * install_exec_creds(). 
*/ void perf_event_exit_task(struct task_struct *child) { -- cgit v1.2.3 From 8639a847b0e11f8d2daa3eafe15a9609c91fd357 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Thu, 28 Apr 2016 16:18:18 -0700 Subject: kexec: update VMCOREINFO for compound_order/dtor makedumpfile refers page.lru.next to get the order of compound pages for page filtering. However, now the order is stored in page.compound_order, hence VMCOREINFO should be updated to export the offset of page.compound_order. The fact is, page.compound_order was introduced already in kernel 4.0, but the offset of it was the same as page.lru.next until kernel 4.3, so this was not actual problem. The above can be said also for page.lru.prev and page.compound_dtor, it's necessary to detect hugetlbfs pages. Further, the content was changed from direct address to the ID which means dtor. The problem is that unnecessary hugepages won't be removed from a dump file in kernels 4.4.x and later. This means that extra disk space would be consumed. It's a problem, but not critical. Signed-off-by: Atsushi Kumagai Acked-by: Dave Young Cc: "Eric W. Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8d34308ea449..cbbb4c724149 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1415,6 +1415,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1447,8 +1449,8 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_X86 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); #endif -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif arch_crash_save_vmcoreinfo(); -- cgit v1.2.3 From d7f53518f713d3d9bf5ed150f943853fb94e7473 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Thu, 28 Apr 2016 16:18:21 -0700 Subject: kexec: export OFFSET(page.compound_head) to find out compound tail page PageAnon() always look at head page to check PAGE_MAPPING_ANON and tail page's page->mapping has just a poisoned data since commit 1c290f642101 ("mm: sanitize page->mapping for tail pages"). If makedumpfile checks page->mapping of a compound tail page to distinguish anonymous page as usual, it must fail in newer kernel. So it's necessary to export OFFSET(page.compound_head) to avoid checking compound tail pages. The problem is that unnecessary hugepages won't be removed from a dump file in kernels 4.5.x and later. This means that extra disk space would be consumed. It's a problem, but not critical. Signed-off-by: Atsushi Kumagai Acked-by: Dave Young Cc: "Eric W. 
Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index cbbb4c724149..1391d3ee3b86 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1417,6 +1417,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(page, compound_dtor); VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP -- cgit v1.2.3 From bdab42dfc974d15303afbf259f340f374a453974 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 28 Apr 2016 16:18:52 -0700 Subject: kcov: don't trace the code coverage code Kcov causes the compiler to add a call to __sanitizer_cov_trace_pc() in every basic block. Ftrace patches in a call to _mcount() to each function it has annotated. Letting these mechanisms annotate each other is a bad thing. Break the loop by adding 'notrace' to __sanitizer_cov_trace_pc() so that ftrace won't try to patch this code. This patch lets arm64 with KCOV and STACK_TRACER boot. Signed-off-by: James Morse Acked-by: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3efbee0834a8..78bed7125515 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -43,7 +43,7 @@ struct kcov { * Entry point from instrumented code. * This is called once per basic-block/edge. */ -void __sanitizer_cov_trace_pc(void) +void notrace __sanitizer_cov_trace_pc(void) { struct task_struct *t; enum kcov_mode mode; -- cgit v1.2.3 From 36f05ae8bce904b4c8105363e6227a79d343bda6 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 28 Apr 2016 16:18:55 -0700 Subject: kcov: don't profile branches in kcov Profiling 'if' statements in __sanitizer_cov_trace_pc() leads to unbound recursion and crash: __sanitizer_cov_trace_pc() -> ftrace_likely_update -> __sanitizer_cov_trace_pc() ... Define DISABLE_BRANCH_PROFILING to disable this tracer. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 78bed7125515..a02f2dddd1d7 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "kcov: " fmt +#define DISABLE_BRANCH_PROFILING #include #include #include -- cgit v1.2.3