summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/platforms/powernv/smp.c1
-rw-r--r--include/linux/irq_work.h9
-rw-r--r--include/linux/sched.h11
-rw-r--r--include/linux/sched/mm.h2
-rw-r--r--include/linux/sched/topology.h29
-rw-r--r--include/linux/smp.h24
-rw-r--r--include/linux/swait.h23
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/exit.c25
-rw-r--r--kernel/irq_work.c53
-rw-r--r--kernel/sched/core.c248
-rw-r--r--kernel/sched/cpuacct.c7
-rw-r--r--kernel/sched/debug.c9
-rw-r--r--kernel/sched/fair.c259
-rw-r--r--kernel/sched/idle.c6
-rw-r--r--kernel/sched/pelt.c24
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h36
-rw-r--r--kernel/sched/smp.h9
-rw-r--r--kernel/sched/topology.c31
-rw-r--r--kernel/smp.c175
21 files changed, 603 insertions, 408 deletions
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..b2ba3e95bda7 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */
idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 3b752e80c017..2735da5f839e 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -13,6 +13,8 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/
+/* flags share CSD_FLAG_ space */
+
#define IRQ_WORK_PENDING BIT(0)
#define IRQ_WORK_BUSY BIT(1)
@@ -23,9 +25,12 @@
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
+/*
+ * structure shares layout with call_single_data_t.
+ */
struct irq_work {
- atomic_t flags;
struct llist_node llnode;
+ atomic_t flags;
void (*func)(struct irq_work *);
};
@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
void irq_work_run(void);
bool irq_work_needs_cpu(void);
+void irq_work_single(void *arg);
#else
static inline bool irq_work_needs_cpu(void) { return false; }
static inline void irq_work_run(void) { }
+static inline void irq_work_single(void *arg) { }
#endif
#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33bb7c539246..12938d438d69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -654,6 +654,7 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
+ unsigned int wake_entry_type;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
})
#ifdef CONFIG_SMP
-void scheduler_ipi(void);
+static __always_inline void scheduler_ipi(void)
+{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+}
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
static inline void scheduler_ipi(void) { }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..a132d875d351 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+void mmdrop(struct mm_struct *mm);
+
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_sem for writing before modifying the
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 95253ad792b0..fb11091129b3 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -11,21 +11,20 @@
*/
#ifdef CONFIG_SMP
-#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
-#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
-#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
-#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
-#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
-#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
-#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
-#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
+#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
+#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
+#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
+#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
+#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
+#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
+#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
+#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
+#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
+#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
+#define SD_NUMA 0x2000 /* cross-node balancing */
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 04019872c7bc..7ee202ad21a6 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -16,17 +16,39 @@
typedef void (*smp_call_func_t)(void *info);
typedef bool (*smp_cond_func_t)(int cpu, void *info);
+
+enum {
+ CSD_FLAG_LOCK = 0x01,
+
+ /* IRQ_WORK_flags */
+
+ CSD_TYPE_ASYNC = 0x00,
+ CSD_TYPE_SYNC = 0x10,
+ CSD_TYPE_IRQ_WORK = 0x20,
+ CSD_TYPE_TTWU = 0x30,
+ CSD_FLAG_TYPE_MASK = 0xF0,
+};
+
+/*
+ * structure shares (partial) layout with struct irq_work
+ */
struct __call_single_data {
struct llist_node llist;
+ unsigned int flags;
smp_call_func_t func;
void *info;
- unsigned int flags;
};
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t
__aligned(sizeof(struct __call_single_data));
+/*
+ * Enqueue a llist_node on the call_single_queue; be very careful, read
+ * flush_smp_call_function_queue() in detail.
+ */
+extern void __smp_call_single_queue(int cpu, struct llist_node *node);
+
/* total number of cpus in this system (may exceed NR_CPUS) */
extern unsigned int total_cpus;
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 73e06e9986d4..6a8c22b8c2a5 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -9,23 +9,10 @@
#include <asm/current.h>
/*
- * BROKEN wait-queues.
- *
- * These "simple" wait-queues are broken garbage, and should never be
- * used. The comments below claim that they are "similar" to regular
- * wait-queues, but the semantics are actually completely different, and
- * every single user we have ever had has been buggy (or pointless).
- *
- * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
- * "wake_up()" does, and has led to problems. In other cases, it has
- * been fine, because there's only ever one waiter (kvm), but in that
- * case gthe whole "simple" wait-queue is just pointless to begin with,
- * since there is no "queue". Use "wake_up_process()" with a direct
- * pointer instead.
- *
- * While these are very similar to regular wait queues (wait.h) the most
- * important difference is that the simple waitqueue allows for deterministic
- * behaviour -- IOW it has strictly bounded IRQ and lock hold times.
+ * Simple waitqueues are semantically very different to regular wait queues
+ * (wait.h). The most important difference is that the simple waitqueue allows
+ * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
*
* Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
* from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
@@ -39,7 +26,7 @@
* sleeper state.
*
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
- * exclusive.
+ * exclusive. As such swake_up_one() will only ever wake _one_ waiter.
*
* - custom wake callback functions; because you cannot give any guarantees
* about random code. This also allows swait to be used in RT, such that
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9f892144db6b..6ff2578ecf17 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}
+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b772f2c671b..c81805a6e03b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 48b5d1b6af4d..eca83965b631 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
- oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &work->llnode);
} else {
__irq_work_queue_local(work);
}
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
return true;
}
+void irq_work_single(void *arg)
+{
+ struct irq_work *work = arg;
+ int flags;
+
+ /*
+ * Clear the PENDING bit, after this point the @work
+ * can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
+ */
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+ lockdep_irq_work_enter(work);
+ work->func(work);
+ lockdep_irq_work_exit(work);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ flags &= ~IRQ_WORK_PENDING;
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- llist_for_each_entry_safe(work, tmp, llnode, llnode) {
- int flags;
- /*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
- */
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
- lockdep_irq_work_enter(work);
- work->func(work);
- lockdep_irq_work_exit(work);
- /*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
- */
- flags &= ~IRQ_WORK_PENDING;
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
- }
+ llist_for_each_entry_safe(work, tmp, llnode, llnode)
+ irq_work_single(work);
}
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0ae29fd57817..d7669027aede 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -21,6 +21,7 @@
#include "../smpboot.h"
#include "pelt.h"
+#include "smp.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
+
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
#endif
-
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
+ /*
+ * rq::ttwu_pending racy indication of out-standing wakeups.
+ * Races such that false-negatives are possible, since they
+ * are shorter lived than false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
rq_unlock_irqrestore(rq, &rf);
}
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
-
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
-
- /*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
- *
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
- */
- irq_enter();
- sched_ttwu_pending();
+ struct rq *rq = cpu_rq(cpu);
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
- irq_exit();
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry);
}
void wake_up_if_idle(int cpu)
@@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq_flags rf;
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
- }
#endif
rq_lock(rq, &rf);
@@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#ifdef CONFIG_SMP
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -2593,6 +2611,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, consider queueing p on the remote CPU's wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ */
+ if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+ goto unlock;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
@@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ const struct sched_class *class;
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
@@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
restart:
-#ifdef CONFIG_SMP
- /*
- * We must do the balancing pass before put_next_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
- break;
- }
-#endif
-
- put_prev_task(rq, prev);
+ put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
@@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
return 0;
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -6243,13 +6265,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as run on the BP, will clean up the active_mm state */
}
/*
@@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6664,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6718,6 @@ void __init sched_init(void)
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
@@ -6716,7 +6739,6 @@ void __init sched_init(void)
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -6744,6 +6766,8 @@ void __init sched_init(void)
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
+
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -7466,6 +7492,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..941c28cf9738 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <asm/irq_regs.h>
#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = task_pt_regs(tsk);
+ struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+ __this_cpu_add(ca->cpuusage->usages[index], cputime);
rcu_read_unlock();
}
@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+ __this_cpu_add(ca->cpustat->cpustat[index], val);
rcu_read_unlock();
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 239970b991c0..36c54265bb2b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name",