diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 09:10:24 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 09:10:24 -0700 |
| commit | f213a6c84c1b4b396a0713ee33cff0e02ba8235f (patch) | |
| tree | d5c0893d171dce6b1f1cf259a0e59bf433da0839 | |
| parent | 621bee34f6ed12d6d4f8896028333fc2200b4ced (diff) | |
| parent | bbdacdfed2f5fa50a2cc9f500a36e05990a0837d (diff) | |
| download | linux-f213a6c84c1b4b396a0713ee33cff0e02ba8235f.tar.gz linux-f213a6c84c1b4b396a0713ee33cff0e02ba8235f.tar.bz2 linux-f213a6c84c1b4b396a0713ee33cff0e02ba8235f.zip | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- fix affine wakeups (Peter Zijlstra)
- improve CPU onlining (and general bootup) scalability on systems
with ridiculous number (thousands) of CPUs (Peter Zijlstra)
- sched/numa updates (Rik van Riel)
- sched/deadline updates (Byungchul Park)
- sched/cpufreq enhancements and related cleanups (Viresh Kumar)
- sched/debug enhancements (Xie XiuQi)
- various fixes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
sched/debug: Optimize sched_domain sysctl generation
sched/topology: Avoid pointless rebuild
sched/topology, cpuset: Avoid spurious/wrong domain rebuilds
sched/topology: Improve comments
sched/topology: Fix memory leak in __sdt_alloc()
sched/completion: Document that reinit_completion() must be called after complete_all()
sched/autogroup: Fix error reporting printk text in autogroup_create()
sched/fair: Fix wake_affine() for !NUMA_BALANCING
sched/debug: Intruduce task_state_to_char() helper function
sched/debug: Show task state in /proc/sched_debug
sched/debug: Use task_pid_nr_ns in /proc/$pid/sched
sched/core: Remove unnecessary initialization init_idle_bootup_task()
sched/deadline: Change return value of cpudl_find()
sched/deadline: Make find_later_rq() choose a closer CPU in topology
sched/numa: Scale scan period with tasks in group and shared/private
sched/numa: Slow down scan rate if shared faults dominate
sched/pelt: Fix false running accounting
sched: Mark pick_next_task_dl() and build_sched_domain() as static
sched/cpupri: Don't re-initialize 'struct cpupri'
sched/deadline: Don't re-initialize 'struct cpudl'
...
| -rw-r--r-- | arch/x86/include/asm/topology.h | 6 | ||||
| -rw-r--r-- | fs/proc/base.c | 3 | ||||
| -rw-r--r-- | include/linux/sched.h | 13 | ||||
| -rw-r--r-- | include/linux/sched/debug.h | 4 | ||||
| -rw-r--r-- | include/linux/sched/task.h | 1 | ||||
| -rw-r--r-- | include/linux/sched/topology.h | 8 | ||||
| -rw-r--r-- | init/main.c | 1 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 6 | ||||
| -rw-r--r-- | kernel/sched/autogroup.c | 3 | ||||
| -rw-r--r-- | kernel/sched/completion.c | 8 | ||||
| -rw-r--r-- | kernel/sched/core.c | 22 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.c | 27 | ||||
| -rw-r--r-- | kernel/sched/cpupri.c | 2 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 33 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 83 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 459 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 4 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 39 |
18 files changed, 459 insertions, 263 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/fs/proc/base.c b/fs/proc/base.c index 719c2e943ea1..98fd8f6df851 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; + struct pid_namespace *ns = inode->i_sb->s_fs_info; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; - proc_sched_show_task(p, m); + proc_sched_show_task(p, ns, m); put_task_struct(p); diff --git a/include/linux/sched.h b/include/linux/sched.h index b57c7bedb534..e5fbce866073 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1233,6 +1233,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +static inline char task_state_to_char(struct task_struct *task) +{ + const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + unsigned long state = task->state; + + state = state ? __ffs(state) + 1 : 0; + + /* Make sure the string lines up properly with the number of task states: */ + BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + + return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; +} + /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index e0eaee54c5a4..5d58d49e9f87 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -6,6 +6,7 @@ */ struct task_struct; +struct pid_namespace; extern void dump_cpu_task(int cpu); @@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_SCHED_DEBUG struct seq_file; -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); +extern void proc_sched_show_task(struct task_struct *p, + struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c97e5f096927..79a2a744648d 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern void init_idle_bootup_task(struct task_struct *idle); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7d065abc7a47..d7b6dab956ec 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -71,6 +71,14 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + + /* + * Some variables from the most recent sd_lb_stats for this domain, + * used by wake_affine(). + */ + unsigned long nr_running; + unsigned long load; + unsigned long capacity; }; struct sched_domain { diff --git a/init/main.c b/init/main.c index 7ec20cf7b111..b78f63c30b17 100644 --- a/init/main.c +++ b/init/main.c @@ -430,7 +430,6 @@ static noinline void __ref rest_init(void) * The boot idle thread must execute schedule() * at least once to get things moving: */ - init_idle_bootup_task(current); schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ cpu_startup_entry(CPUHP_ONLINE); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 87a1213dd326..df403e97b073 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2344,13 +2344,7 @@ void cpuset_update_active_cpus(void) * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks to the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. */ - partition_sched_domains(1, NULL, NULL); schedule_work(&cpuset_hotplug_work); } diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index da39489d2d80..de6d7f4dfcb5 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void) goto out_fail; tg = sched_create_group(&root_task_group); - if (IS_ERR(tg)) goto out_free; @@ -101,7 +100,7 @@ out_free: out_fail: if (printk_ratelimit()) { printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? "sched_create_group()" : "kmalloc()"); + ag ? "sched_create_group()" : "kzalloc()"); } return autogroup_kref_get(&autogroup_default); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index c9524d2d9316..5d9131aa846f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete); * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. + * + * Since complete_all() sets the completion of @x permanently to done + * to allow multiple waiters to finish, a call to reinit_completion() + * must be used on @x if @x is to be used again. The code must make + * sure that all waiters have woken and finished before reinitializing + * @x. Also note that the function completion_done() can not be used + * to know if there are still waiters after complete_all() has been called. */ void complete_all(struct completion *x) { @@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion); * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * + * Note, this will always return true if complete_all() was called on @X. */ bool completion_done(struct completion *x) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e053c31d96da..c1fcd96cf432 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5133,24 +5133,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); @@ -5207,11 +5200,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -5468,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) */ next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); - next->sched_class->put_prev_task(rq, next); + put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d026..8d9562d890d3 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp) * @p: the task * @later_mask: a mask to fill in with the selected CPUs (or NULL) * - * Returns: int - best CPU (heap maximum if suitable) + * Returns: int - CPUs were found */ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask) { - int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { - best_cpu = cpumask_any(later_mask); - goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && - dl_time_before(dl_se->deadline, cp->elements[0].dl)) { - best_cpu = cpudl_maximum(cp); - if (later_mask) - cpumask_set_cpu(best_cpu, later_mask); - } + return 1; + } else { + int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); -out: - WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); - return best_cpu; + return 1; + } + } + return 0; } /* @@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp) { int i; - memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..2511aba36b89 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp) { int i; - memset(cp, 0, sizeof(*cp)); - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..d05bd9457a40 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) return; /* @@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, p, NULL)) return; resched_curr(rq); @@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct * +static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; @@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task) struct sched_domain *sd; struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); - int best_cpu, cpu = task_cpu(task); + int cpu = task_cpu(task); /* Make sure the mask is initialized first */ if (unlikely(!later_mask)) @@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, - task, later_mask); - if (best_cpu == -1) + if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; /* - * If we are here, some target has been found, - * the most suitable of which is cached in best_cpu. - * This is, among the runqueues where the current tasks - * have later deadlines than the task's one, the rq - * with the latest possible one. + * If we are here, some targets have been found, including + * the most suitable which is, among the runqueues where the + * current tasks have later deadlines than the task's one, the + * rq with the latest possible one. * * Now we check how well this matches with task's * affinity and system topology. @@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task) rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; /* * If possible, preempting this_cpu is @@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } + best_cpu = cpumask_first_and(later_mask, + sched_domain_span(sd)); /* - * Last chance: if best_cpu is valid and is - * in the mask, that becomes our choice. + * Last chance: if a cpu being in both later_mask + * and current sd span is valid, that becomes our + * choice. Of course, the latest possible cpu is + * already under consideration through later_mask. */ - if (best_cpu < nr_cpu_ids && - cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..4a23bbc3111b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } +static cpumask_var_t sd_sysctl_cpus; static struct ctl_table_header *sd_sysctl_header; + void register_sched_domain_sysctl(void) { - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + static struct ctl_table *cpu_entries; + static struct ctl_table **cpu_idx; char buf[32]; + int i; - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; + if (!cpu_entries) { + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); + if (!cpu_entries) + return; - if (entry == NULL) - return; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = cpu_entries; + } - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; + if (!cpu_idx) { + struct ctl_table *e = cpu_entries; + + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); + if (!cpu_idx) + return; + + /* deal with sparse possible map */ + for_each_possible_cpu(i) { + cpu_idx[i] = e; + e++; + } + } + + if (!cpumask_available(sd_sysctl_cpus)) { + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) + return; + + /* init to possible to not have holes in @cpu_entries */ + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + } + + for_each_cpu(i, sd_sysctl_cpus) { + struct ctl_table *e = cpu_idx[i]; + + if (e->child) + sd_free_ctl_entry(&e->child); + + if (!e->procname) { + snprintf(buf, 32, "cpu%d", i); + e->procname = kstrdup(buf, GFP_KERNEL); + } + e->mode = 0555; + e->child = sd_alloc_ctl_cpu_table(i); + + __cpumask_clear_cpu(i, sd_sysctl_cpus); } WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +void dirty_sched_domain_sysctl(int cpu) +{ + if (cpumask_available(sd_sysctl_cpus)) + __cpumask_set_cpu(cpu, sd_sysctl_cpus); +} + /* may be called multiple times per register */ void unregister_sched_domain_sysctl(void) { unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); } #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ @@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg) } #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { if (rq->curr == p) - SEQ_printf(m, "R"); + SEQ_printf(m, ">R"); else - SEQ_printf(m, " "); + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), @@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" + " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n" - "------------------------------------------------------" + "-------------------------------------------------------" "----------------------------------------------------\n"); rcu_read_lock(); @@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..8d5868771cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se) /* * For !fair tasks do: * - update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * @@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* - * Faults_cpu is used to decide whether memory should move - * towards the CPU. As a consequence, these stats are weighted - * more by CPU use than by memory faults. - */ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + + return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; + } + + return faults; +} + /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations @@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); +static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); @@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) * Reset the scan period if the task is being rescheduled on an * alternative node to recheck if the tasks is now properly placed. */ - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { unsigned int period_slot; - int ratio; + int lr_ratio, ps_ratio; int diff; unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p, * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) */ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are local. There is no need to + * do fast NUMA scanning, since memory is already local. + */ + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are shared with other tasks. + * There is no point in continuing fast NUMA scanning, + * since other tasks may just move the memory elsewhere. + */ + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot) slot = 1; diff = slot * period_slot; } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, + * yet they are not on the local NUMA node. Speed up + * NUMA scanning to get the memory moved over. */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + int ratio = max(lr_ratio, ps_ratio); + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; } p->numa_scan_period = clamp(p->numa_scan_period + diff, @@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) { p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); } next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now > curr->node_stamp + period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); + curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { @@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - */ -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - struct numa_stats prev_load, this_load; - s64 this_eff_load, prev_eff_load; - - update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); - update_numa_stats(&this_load, cpu_to_node(this_cpu)); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - if (this_load.load > current_load) - this_load.load -= current_load; - else - this_load.load = 0; - } - - /* - * In low-load situations, where this_cpu's node is idle due to the - * sync cause above having dropped this_load.load to 0, move the task. - * Moving to an idle socket will not create a bad imbalance. - * - * Otherwise check if the nodes are near enough in load to allow this - * task to be woken on this_cpu's node. - */ - if (this_load.load > 0) { - unsigned long task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_load.compute_capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_load.compute_capacity; - - this_eff_load *= this_load.load + task_load; - prev_eff_load *= prev_load.load - task_load; - - return this_eff_load <= prev_eff_load; - } - - return true; -} #else static void task_tick_numa(struct rq *rq, struct task_struct *c |
