summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 13:33:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 13:33:57 -0700
commit16b3d0cf5bad844daaf436ad2e9061de0fe36e5c (patch)
treed553a51e6d95fb166df7fa62264e9a27e4c438a4
parent42dec9a936e7696bea1f27d3c5a0068cd9aa95fd (diff)
parent2ea46c6fc9452ac100ad907b051d797225847e33 (diff)
downloadlinux-16b3d0cf5bad844daaf436ad2e9061de0fe36e5c.tar.gz
linux-16b3d0cf5bad844daaf436ad2e9061de0fe36e5c.tar.bz2
linux-16b3d0cf5bad844daaf436ad2e9061de0fe36e5c.zip
Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - Clean up SCHED_DEBUG: move the decades old mess of sysctl, procfs and debugfs interfaces to a unified debugfs interface. - Signals: Allow caching one sigqueue object per task, to improve performance & latencies. - Improve newidle_balance() irq-off latencies on systems with a large number of CPU cgroups. - Improve energy-aware scheduling - Improve the PELT metrics for certain workloads - Reintroduce select_idle_smt() to improve load-balancing locality - but without the previous regressions - Add 'scheduler latency debugging': warn after long periods of pending need_resched. This is an opt-in feature that requires the enabling of the LATENCY_WARN scheduler feature, or the use of the resched_latency_warn_ms=xx boot parameter. - CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix remaining balance_push() vs. hotplug holes/races - PSI fixes, plus allow /proc/pressure/ files to be written by CAP_SYS_RESOURCE tasks as well - Fix/improve various load-balancing corner cases vs. capacity margins - Fix sched topology on systems with NUMA diameter of 3 or above - Fix PF_KTHREAD vs to_kthread() race - Minor rseq optimizations - Misc cleanups, optimizations, fixes and smaller updates * tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits) cpumask/hotplug: Fix cpu_dying() state tracking kthread: Fix PF_KTHREAD vs to_kthread() race sched/debug: Fix cgroup_path[] serialization sched,psi: Handle potential task count underflow bugs more gracefully sched: Warn on long periods of pending need_resched sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning sched/debug: Rename the sched_debug parameter to sched_verbose sched,fair: Alternative sched_slice() sched: Move /proc/sched_debug to debugfs sched,debug: Convert sysctl sched_domains to debugfs debugfs: Implement debugfs_create_str() sched,preempt: Move preempt_dynamic to debug.c sched: Move SCHED_DEBUG sysctl to debugfs sched: Don't make LATENCYTOP select SCHED_DEBUG sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG sched: Use cpu_dying() to fix balance_push vs hotplug-rollback cpumask: Introduce DYING mask cpumask: Make cpu_{online,possible,present,active}() inline rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs() ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--Documentation/scheduler/sched-domains.rst10
-rw-r--r--drivers/usb/usbip/usbip_common.h1
-rw-r--r--fs/debugfs/file.c91
-rw-r--r--include/linux/cpumask.h117
-rw-r--r--include/linux/debugfs.h17
-rw-r--r--include/linux/kcov.h1
-rw-r--r--include/linux/psi.h1
-rw-r--r--include/linux/psi_types.h3
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/sched/sysctl.h9
-rw-r--r--include/linux/signal.h1
-rw-r--r--include/uapi/linux/ptrace.h10
-rw-r--r--kernel/cpu.c210
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/kthread.c33
-rw-r--r--kernel/ptrace.c25
-rw-r--r--kernel/rseq.c29
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/core.c217
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/sched/deadline.c12
-rw-r--r--kernel/sched/debug.c435
-rw-r--r--kernel/sched/fair.c380
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/sched/idle.c10
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.c2
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c164
-rw-r--r--kernel/sched/rt.c6
-rw-r--r--kernel/sched/sched.h59
-rw-r--r--kernel/sched/stats.c2
-rw-r--r--kernel/sched/stats.h37
-rw-r--r--kernel/sched/topology.c113
-rw-r--r--kernel/signal.c59
-rw-r--r--kernel/stop_machine.c1
-rw-r--r--kernel/sysctl.c94
-rw-r--r--lib/Kconfig.debug1
-rw-r--r--net/core/skbuff.c1
-rw-r--r--net/mac80211/iface.c1
-rw-r--r--net/mac80211/rx.c1
46 files changed, 1284 insertions, 900 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 54582ca6c4f9..74db44ce4d9a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4754,7 +4754,7 @@
sbni= [NET] Granch SBNI12 leased line adapter
- sched_debug [KNL] Enables verbose scheduler debug messages.
+ sched_verbose [KNL] Enables verbose scheduler debug messages.
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst
index 8582fa5e9170..14ea2f21d20e 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.
The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
-tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
-knob. This enables an error checking parse of the sched domains which should
-catch most possible errors (described above). It also prints out the domain
-structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+forgot to tweak your cmdline, you can also flip the
+/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
+the sched domains which should catch most possible errors (described above). It
+also prints out the domain structure in a visual format.
diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h
index ea2a20e6d27d..d8cbd2dfc2c2 100644
--- a/drivers/usb/usbip/usbip_common.h
+++ b/drivers/usb/usbip/usbip_common.h
@@ -18,6 +18,7 @@
#include <linux/usb.h>
#include <linux/wait.h>
#include <linux/sched/task.h>
+#include <linux/kcov.h>
#include <uapi/linux/usbip.h>
#undef pr_fmt
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 9979d981e9be..e813acfaa6e8 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -864,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct dentry *dentry = F_DENTRY(file);
+ char *str, *copy = NULL;
+ int copy_len, len;
+ ssize_t ret;
+
+ ret = debugfs_file_get(dentry);
+ if (unlikely(ret))
+ return ret;
+
+ str = *(char **)file->private_data;
+ len = strlen(str) + 1;
+ copy = kmalloc(len, GFP_KERNEL);
+ if (!copy) {
+ debugfs_file_put(dentry);
+ return -ENOMEM;
+ }
+
+ copy_len = strscpy(copy, str, len);
+ debugfs_file_put(dentry);
+ if (copy_len < 0) {
+ kfree(copy);
+ return copy_len;
+ }
+
+ copy[copy_len] = '\n';
+
+ ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
+ kfree(copy);
+
+ return ret;
+}
+
+static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* This is really only for read-only strings */
+ return -EINVAL;
+}
+
+static const struct file_operations fops_str = {
+ .read = debugfs_read_file_str,
+ .write = debugfs_write_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_str_ro = {
+ .read = debugfs_read_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_str_wo = {
+ .write = debugfs_write_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+/**
+ * debugfs_create_str - create a debugfs file that is used to read and write a string value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
+ * be returned.
+ */
+void debugfs_create_str(const char *name, umode_t mode,
+ struct dentry *parent, char **value)
+{
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
+ &fops_str_ro, &fops_str_wo);
+}
+
static ssize_t read_file_blob(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e30f12..e6b948a6000d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -91,44 +91,15 @@ extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_dying_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask)
extern atomic_t __num_online_cpus;
-#if NR_CPUS > 1
-/**
- * num_online_cpus() - Read the number of online CPUs
- *
- * Despite the fact that __num_online_cpus is of type atomic_t, this
- * interface gives only a momentary snapshot and is not protected against
- * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
- * region.
- */
-static inline unsigned int num_online_cpus(void)
-{
- return atomic_read(&__num_online_cpus);
-}
-#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
-#define num_present_cpus() cpumask_weight(cpu_present_mask)
-#define num_active_cpus() cpumask_weight(cpu_active_mask)
-#define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask)
-#define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask)
-#define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask)
-#define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask)
-#else
-#define num_online_cpus() 1U
-#define num_possible_cpus() 1U
-#define num_present_cpus() 1U
-#define num_active_cpus() 1U
-#define cpu_online(cpu) ((cpu) == 0)
-#define cpu_possible(cpu) ((cpu) == 0)
-#define cpu_present(cpu) ((cpu) == 0)
-#define cpu_active(cpu) ((cpu) == 0)
-#endif
-
extern cpumask_t cpus_booted_once_mask;
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
@@ -857,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active)
cpumask_clear_cpu(cpu, &__cpu_active_mask);
}
+static inline void
+set_cpu_dying(unsigned int cpu, bool dying)
+{
+ if (dying)
+ cpumask_set_cpu(cpu, &__cpu_dying_mask);
+ else
+ cpumask_clear_cpu(cpu, &__cpu_dying_mask);
+}
/**
* to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
@@ -894,6 +873,82 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
return to_cpumask(p);
}
+#if NR_CPUS > 1
+/**
+ * num_online_cpus() - Read the number of online CPUs
+ *
+ * Despite the fact that __num_online_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_online_cpus(void)
+{
+ return atomic_read(&__num_online_cpus);
+}
+#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
+#define num_present_cpus() cpumask_weight(cpu_present_mask)
+#define num_active_cpus() cpumask_weight(cpu_active_mask)
+
+static inline bool cpu_online(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_online_mask);
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_possible_mask);
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_present_mask);
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_active_mask);
+}
+
+static inline bool cpu_dying(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_dying_mask);
+}
+
+#else
+
+#define num_online_cpus() 1U
+#define num_possible_cpus() 1U
+#define num_present_cpus() 1U
+#define num_active_cpus() 1U
+
+static inline bool cpu_online(unsigned int cpu)
+{
+ return cpu == 0;
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+ return cpu == 0;
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+ return cpu == 0;
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+ return cpu == 0;
+}
+
+static inline bool cpu_dying(unsigned int cpu)
+{
+ return false;
+}
+
+#endif /* NR_CPUS > 1 */
+
#define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
#if NR_CPUS <= BITS_PER_LONG
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index d6c4cc9ecc77..1fdb4343af9c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value);
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent, bool *value);
+void debugfs_create_str(const char *name, umode_t mode,
+ struct dentry *parent, char **value);
struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
@@ -156,6 +158,9 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos);
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#else
#include <linux/err.h>
@@ -297,6 +302,11 @@ static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
return ERR_PTR(-ENODEV);
}
+static inline void debugfs_create_str(const char *name, umode_t mode,
+ struct dentry *parent,
+ char **value)
+{ }
+
static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
@@ -348,6 +358,13 @@ static inline ssize_t debugfs_write_file_bool(struct file *file,
return -ENODEV;
}
+static inline ssize_t debugfs_read_file_str(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
#endif
/**
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index 4e3037dc1204..55dc338f6bcd 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_KCOV_H
#define _LINUX_KCOV_H
+#include <linux/sched.h>
#include <uapi/linux/kcov.h>
struct task_struct;
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023f3fdd..65eb1476ac70 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_memstall_tick(struct task_struct *task, int cpu);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index b95f3211566a..0a23300d49af 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -50,9 +50,10 @@ enum psi_states {
PSI_MEM_SOME,
PSI_MEM_FULL,
PSI_CPU_SOME,
+ PSI_CPU_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES = 6,
+ NR_PSI_STATES = 7,
};
enum psi_aggregators {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 743a613c9cf3..f652a8ccad73 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,7 +14,6 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
-#include <linux/kcov.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
@@ -985,6 +984,7 @@ struct task_struct {
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct __rcu *sighand;
+ struct sigqueue *sigqueue_cache;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
@@ -1101,7 +1101,7 @@ struct task_struct {
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
- /* Seqence number to catch updates: */
+ /* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..db2c0f34aaaf 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
enum { sysctl_hung_task_timeout_secs = 0 };
#endif
+extern unsigned int sysctl_sched_child_runs_first;
+
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
};
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
@@ -48,8 +49,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos);
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
#endif
/*
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 1e98548d7cf6..0dbfda8d99d0 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -266,6 +266,7 @@ static inline void init_sigpending(struct sigpending *sig)
}
extern void flush_sigqueue(struct sigpending *queue);
+extern void exit_task_sigqueue_cache(struct task_struct *tsk);
/* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
static inline int valid_signal(unsigned long sig)
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 83ee45fa634b..3747bf816f9a 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -102,6 +102,16 @@ struct ptrace_syscall_info {
};
};
+#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f
+
+struct ptrace_rseq_configuration {
+ __u64 rseq_abi_pointer;
+ __u32 rseq_abi_size;
+ __u32 signature;
+ __u32 flags;
+ __u32 pad;
+};
+
/*
* These values are stored in task->ptrace_message
* by tracehook_report_syscall_* to describe the current syscall-stop.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1b6302ecbabe..e538518556f4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -135,6 +136,11 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
return cpuhp_hp_states + state;
}
+static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
+{
+ return bringup ? !step->startup.single : !step->teardown.single;
+}
+
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
@@ -157,26 +163,24 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
if (st->fail == state) {
st->fail = CPUHP_INVALID;
-
- if (!(bringup ? step->startup.single : step->teardown.single))
- return 0;
-
return -EAGAIN;
}
+ if (cpuhp_step_empty(bringup, step)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (!step->multi_instance) {
WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
- if (!cb)
- return 0;
+
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
- if (!cbm)
- return 0;
/* Single invocation for instance add/remove */
if (node) {
@@ -461,13 +465,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;
st->rollback = false;
st->last = NULL;
st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
return prev_state;
}
@@ -475,6 +482,17 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
+ st->target = prev_state;
+
+ /*
+ * Already rolling back. No need invert the bringup value or to change
+ * the current state.
+ */
+ if (st->rollback)
+ return;
+
st->rollback = true;
/*
@@ -488,8 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}
- st->target = prev_state;
- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -591,10 +610,53 @@ static int finish_cpu(unsigned int cpu)
* Hotplug state machine related functions
*/
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
+/*
+ * Get the next state to run. Empty ones will be skipped. Returns true if a
+ * state must be run.
+ *
+ * st->state will be modified ahead of time, to match state_to_run, as if it
+ * has already ran.
+ */
+static bool cpuhp_next_state(bool bringup,
+ enum cpuhp_state *state_to_run,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ do {
+ if (bringup) {
+ if (st->state >= target)
+ return false;
+
+ *state_to_run = ++s