diff options
| -rw-r--r-- | kernel/sched/Makefile | 2 | ||||
| -rw-r--r-- | kernel/sched/core.c | 1659 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 23 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 1658 |
4 files changed, 1684 insertions, 1658 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b832ae2b..130ce8ac725b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,7 +18,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1cea6c61fb01..e4aa470ed454 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -31,7 +31,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* @@ -5446,7 +5445,7 @@ out: #ifdef CONFIG_SMP -static bool sched_smp_initialized __read_mostly; +bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -5643,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -static void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq) { if (!rq->online) { const struct sched_class *class; @@ -5658,7 +5657,7 @@ static void set_rq_online(struct rq *rq) } } -static void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq) { if (rq->online) { const struct sched_class *class; @@ -5680,1658 +5679,6 @@ static void set_cpu_rq_start_time(unsigned int cpu) rq->age_stamp = sched_clock_cpu(cpu); } -/* Protected by sched_domains_mutex: */ -static cpumask_var_t sched_domains_tmpmask; - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - struct sched_group *group = sd->groups; - - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } - - if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); - } - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_enabled 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - cpudl_cleanup(&rd->cpudl); - free_cpumask_var(rd->dlo_mask); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) - goto free_online; - if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_dlo_mask; - - init_dl_bw(&rd->dl_bw); - if (cpudl_init(&rd->cpudl) != 0) - goto free_rto_mask; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_cpudl; - return 0; - -free_cpudl: - cpudl_cleanup(&rd->cpudl); -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_dlo_mask: - free_cpumask_var(rd->dlo_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -/* - * By default the system creates a single root-domain with all CPUs as - * members (mimicking the global state we have today). - */ -struct root_domain def_root_domain; - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_groups(struct sched_group *sg, int free_sgc) -{ - struct sched_group *tmp, *first; - - if (!sg) - return; - - first = sg; - do { - tmp = sg->next; - - if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) - kfree(sg->sgc); - - kfree(sg); - sg = tmp; - } while (sg != first); -} - -static void destroy_sched_domain(struct sched_domain *sd) -{ - /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. - */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); - kfree(sd); -} - -static void destroy_sched_domains_rcu(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - while (sd) { - struct sched_domain *parent = sd->parent; - destroy_sched_domain(sd); - sd = parent; - } -} - -static void destroy_sched_domains(struct sched_domain *sd) -{ - if (sd) - call_rcu(&sd->rcu, destroy_sched_domains_rcu); -} - -/* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). - * - * Also keep a unique ID per domain (we use the first CPU number in - * the cpumask of the domain), this allows us to quickly tell if - * two CPUs are in the same cache domain, see cpus_share_cache(). - */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); -DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); - -static void update_top_cache_domain(int cpu) -{ - struct sched_domain_shared *sds = NULL; - struct sched_domain *sd; - int id = cpu; - int size = 1; - - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - id = cpumask_first(sched_domain_span(sd)); - size = cpumask_weight(sched_domain_span(sd)); - sds = sd->shared; - } - - rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); - per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); - - sd = lowest_flag_domain(cpu, SD_NUMA); - rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); - - sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp); - - update_top_cache_domain(cpu); -} - -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. - * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. - * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. - */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) -{ - const struct cpumask *span = sched_domain_span(sd); - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - for_each_cpu(i, span) { - sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - cpumask_set_cpu(i, sched_group_mask(sg)); - } -} - -/* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. - */ -int group_balance_cpu(struct sched_group *sg) -{ - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); -} - -static int -build_overlap_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered = sched_domains_tmpmask; - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct cpumask *sg_span; - - if (cpumask_test_cpu(i, covered)) - continue; - - sibling = *per_cpu_ptr(sdd->sd, i); - - /* See the comment near build_group_mask(). */ - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - - if (!sg) - goto fail; - - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - - cpumask_or(covered, covered, sg_span); - - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - last->next = first; - } - sd->groups = groups; - - return 0; - -fail: - free_sched_groups(first, 0); - - return -ENOMEM; -} - -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) -{ - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - struct sched_domain *child = sd->child; - - if (child) - cpu = cpumask_first(sched_domain_span(child)); - - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); - - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); - } - - return cpu; -} - -/* - * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. - * - * Assumes the sched_domain tree is fully constructed - */ -static int -build_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL; - struct sd_data *sdd = sd->private; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered; - int i; - - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - - lockdep_assert_held(&sched_domains_mutex); - covered = sched_domains_tmpmask; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group, j; - - if (cpumask_test_cpu(i, covered)) - continue; - - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); - - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; - - return 0; -} - -/* - * Initialize sched groups cpu_capacity. - * - * cpu_capacity indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_capacity for all the groups in a sched domain will be same - * unless there are asymmetries in the topology. If there are asymmetries, - * group having more cpu_capacity will pickup more load compared to the - * group having less cpu_capacity. - */ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) -{ - struct sched_group *sg = sd->groups; - - WARN_ON(!sg); - - do { - int cpu, max_cpu = -1; - - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); - - if (!(sd->flags & SD_ASYM_PACKING)) - goto next; - - for_each_cpu(cpu, sched_group_cpus(sg)) { - if (max_cpu < 0) - max_cpu = cpu; - else if (sched_asym_prefer(cpu, max_cpu)) - max_cpu = cpu; - } - sg->asym_prefer_cpu = max_cpu; - -next: - sg = sg->next; - } while (sg != sd->groups); - - if (cpu != group_balance_cpu(sg)) - return; - - update_group_capacity(sd, cpu); -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* Turn off idle balance on this domain: */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* Turn on idle balance on this domain: */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); - /* Fall through */ - case sa_sd: - free_percpu(d->sd); - /* Fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); - /* Fall through */ - case sa_none: - break; - } -} - -static enum s_alloc -__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain and - * sched_group structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. - */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. - * - * These flags are purely descriptive of the topology and do not prescribe - * behaviour. Behaviour is artificial and mapped in the below sd_init() - * function: - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies - * - * Odd one out, which beside describing the topology has a quirk also - * prescribes the desired behaviour that goes along with it: - * - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) -{ - struct sd_data *sdd = &tl->data; - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - int sd_id, sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, - .child = child, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - - sd->private = sdd; - - return sd; -} - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = - default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - if (WARN_ON_ONCE(sched_smp_initialized)) - return; - - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -/* - * A system can have three types of NUMA topology: - * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system - * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes - * NUMA_BACKPLANE: nodes can reach other nodes through a backplane - * - * The difference between a glueless mesh topology and a backplane - * topology lies in whether communication between not directly - * connected nodes goes through intermediary nodes (where programs - * could run), or through backplane controllers. This affects - * placement of programs. - * - * The type of topology can be discerned with the following tests: - * - If the maximum distance between any nodes is 1 hop, the system - * is directly connected. - * - If for two nodes A and B, located N > 1 hops away from each other, - * there is an intermediary node C, which is < N hops away from both - * nodes A and B, the system is a glueless mesh. - */ -static void init_numa_topology_type(void) -{ - int a, b, c, n; - - n = sched_max_numa_distance; - - if (sched_domains_numa_levels <= 1) { - sched_numa_topology_type = NUMA_DIRECT; - return; - } - - for_each_online_node(a) { - for_each_online_node(b) { - /* Find two nodes furthest removed from each other. */ - if (node_distance(a, b) < n) - continue; - - /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { - if (node_distance(a, c) < n && - node_distance(b, c) < n) { - sched_numa_topology_type = - NUMA_GLUELESS_MESH; - return; - } - } - - sched_numa_topology_type = NUMA_BACKPLANE; - return; - } - } -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. - */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - - if (!level) - return; - - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. - */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(vo |
