From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 31 Aug 2017 16:37:41 -0400 Subject: livepatch: introduce shadow variable API Add exported API for livepatch modules: klp_shadow_get() klp_shadow_alloc() klp_shadow_get_or_alloc() klp_shadow_free() klp_shadow_free_all() that implement "shadow" variables, which allow callers to associate new shadow fields to existing data structures. This is intended to be used by livepatch modules seeking to emulate additions to data structure definitions. See Documentation/livepatch/shadow-vars.txt for a summary of the new shadow variable API, including a few common use cases. See samples/livepatch/livepatch-shadow-* for example modules that demonstrate shadow variables. [jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by Josh] Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/Makefile | 2 +- kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 kernel/livepatch/shadow.c (limited to 'kernel') diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 2b8bdb1925da..b36ceda6488e 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o transition.o +livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c new file mode 100644 index 000000000000..67e4360521f3 --- /dev/null +++ b/kernel/livepatch/shadow.c @@ -0,0 +1,277 @@ +/* + * shadow.c - Shadow Variables + * + * Copyright (C) 2014 Josh Poimboeuf + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/** + * DOC: Shadow variable API concurrency notes: + * + * The shadow variable API provides a simple relationship between an + * pair and a pointer value. It is the responsibility of the + * caller to provide any mutual exclusion required of the shadow data. + * + * Once a shadow variable is attached to its parent object via the + * klp_shadow_*alloc() API calls, it is considered live: any subsequent + * call to klp_shadow_get() may then return the shadow variable's data + * pointer. Callers of klp_shadow_*alloc() should prepare shadow data + * accordingly. + * + * The klp_shadow_*alloc() API calls may allocate memory for new shadow + * variable structures. Their implementation does not call kmalloc + * inside any spinlocks, but API callers should pass GFP flags according + * to their specific needs. + * + * The klp_shadow_hash is an RCU-enabled hashtable and is safe against + * concurrent klp_shadow_free() and klp_shadow_get() operations. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static DEFINE_HASHTABLE(klp_shadow_hash, 12); + +/* + * klp_shadow_lock provides exclusive access to the klp_shadow_hash and + * the shadow variables it references. + */ +static DEFINE_SPINLOCK(klp_shadow_lock); + +/** + * struct klp_shadow - shadow variable structure + * @node: klp_shadow_hash hash table node + * @rcu_head: RCU is used to safely free this structure + * @obj: pointer to parent object + * @id: data identifier + * @data: data area + */ +struct klp_shadow { + struct hlist_node node; + struct rcu_head rcu_head; + void *obj; + unsigned long id; + char data[]; +}; + +/** + * klp_shadow_match() - verify a shadow variable matches given + * @shadow: shadow variable to match + * @obj: pointer to parent object + * @id: data identifier + * + * Return: true if the shadow variable matches. + */ +static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj, + unsigned long id) +{ + return shadow->obj == obj && shadow->id == id; +} + +/** + * klp_shadow_get() - retrieve a shadow variable data pointer + * @obj: pointer to parent object + * @id: data identifier + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + + rcu_read_lock(); + + hash_for_each_possible_rcu(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + rcu_read_unlock(); + return shadow->data; + } + } + + rcu_read_unlock(); + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_shadow_get); + +void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags, bool warn_on_exist) +{ + struct klp_shadow *new_shadow; + void *shadow_data; + unsigned long flags; + + /* Check if the shadow variable already exists */ + shadow_data = klp_shadow_get(obj, id); + if (shadow_data) + goto exists; + + /* Allocate a new shadow variable for use inside the lock below */ + new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); + if (!new_shadow) + return NULL; + + new_shadow->obj = obj; + new_shadow->id = id; + + /* Initialize the shadow variable if data provided */ + if (data) + memcpy(new_shadow->data, data, size); + + /* Look for again under the lock */ + spin_lock_irqsave(&klp_shadow_lock, flags); + shadow_data = klp_shadow_get(obj, id); + if (unlikely(shadow_data)) { + /* + * Shadow variable was found, throw away speculative + * allocation. + */ + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + goto exists; + } + + /* No found, so attach the newly allocated one */ + hash_add_rcu(klp_shadow_hash, &new_shadow->node, + (unsigned long)new_shadow->obj); + spin_unlock_irqrestore(&klp_shadow_lock, flags); + + return new_shadow->data; + +exists: + if (warn_on_exist) { + WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id); + return NULL; + } + + return shadow_data; +} + +/** + * klp_shadow_alloc() - allocate and add a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Allocates @size bytes for new shadow variable data using @gfp_flags + * and copies @size bytes from @data into the new shadow variable's own + * data space. If @data is NULL, @size bytes are still allocated, but + * no copy is performed. The new shadow variable is then added to the + * global hashtable. + * + * If an existing shadow variable can be found, this routine + * will issue a WARN, exit early and return NULL. + * + * Return: the shadow variable data element, NULL on duplicate or + * failure. + */ +void *klp_shadow_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); +} +EXPORT_SYMBOL_GPL(klp_shadow_alloc); + +/** + * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Returns a pointer to existing shadow data if an shadow + * variable is already present. Otherwise, it creates a new shadow + * variable like klp_shadow_alloc(). + * + * This function guarantees that only one shadow variable exists with + * the given @id for the given @obj. It also guarantees that the shadow + * variable will be initialized by the given @data only when it did not + * exist before. + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); +} +EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); + +/** + * klp_shadow_free() - detach and free a shadow variable + * @obj: pointer to parent object + * @id: data identifier + * + * This function releases the memory for this shadow variable + * instance, callers should stop referencing it accordingly. + */ +void klp_shadow_free(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete from hash */ + hash_for_each_possible(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + break; + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free); + +/** + * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * @id: data identifier + * + * This function releases the memory for all <*, id> shadow variable + * instances, callers should stop referencing them accordingly. + */ +void klp_shadow_free_all(unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + int i; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete all <*, id> from hash */ + hash_for_each(klp_shadow_hash, i, shadow, node) { + if (klp_shadow_match(shadow, shadow->obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free_all); -- cgit v1.2.3 From 5d9da759f7587c87252ef98e70bc0b4a89e4d036 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 14 Sep 2017 14:15:36 -0700 Subject: livepatch: __klp_shadow_get_or_alloc() is local to shadow.c ... therefore make it static. Fixes: 439e7271dc2 ("livepatch: introduce shadow variable API") Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 67e4360521f3..fdac27588d60 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,7 +113,7 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, size_t size, gfp_t gfp_flags, bool warn_on_exist) { struct klp_shadow *new_shadow; -- cgit v1.2.3 From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:55 -0400 Subject: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE This is a simple non-recursive delete operation. It prunes paths of empty nodes in the tree, but it does not try to further compress the tree as nodes are removed. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..9d58a576b2ae 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,84 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim; + struct lpm_trie_node *node; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of where we could begin trimming the tree. The trim-point + * is the sub-tree along the walk consisting of only single-child + * intermediate nodes and ending at a leaf node that we want to + * remove. + */ + trim = &trie->root; + node = rcu_dereference_protected( + trie->root, lockdep_is_held(&trie->lock)); + while (node) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid + * prefix itself, do not remove it. Reset the root of the trim + * path to its descendant on our path. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = &node->child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(&trie->lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from + * zero or more intermediate nodes to our leaf node for deletion. + */ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? + &node->child[0] : + &node->child[1]; + kfree_rcu(node, rcu); + } + +out: + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; } #define LPM_DATA_SIZE_MAX 256 -- cgit v1.2.3 From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:11 +0300 Subject: signal: replace sigset_to_compat() with put_compat_sigset() There are 4 callers of sigset_to_compat() in the entire kernel. One is in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself. All are followed by copy_to_user(), and all but the sparc one are under "if it's big-endian..." ifdefs. Let's transform sigset_to_compat() into put_compat_sigset() that also calls copy_to_user(). Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/compat.c | 20 ++++++++++++++------ kernel/signal.c | 27 ++++++--------------------- 2 files changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 772e038d04d9..18dd902c9052 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -void -sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) { + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; switch (_NSIG_WORDS) { - case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; - case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; - case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; - case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif } #ifdef CONFIG_NUMA diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..14ad6bb90dad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, if (error) return error; } - if (oset) { - compat_sigset_t old32; - sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; + return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; #else return sys_rt_sigprocmask(how, (sigset_t __user *)nset, (sigset_t __user *)oset, sigsetsize); @@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sigsetsize); - if (!err) { - compat_sigset_t set32; - sigset_to_compat(&set32, &set); - /* we can get here only if sigsetsize <= sizeof(set) */ - if (copy_to_user(uset, &set32, sigsetsize)) - err = -EFAULT; - } + if (!err) + err = put_compat_sigset(uset, &set, sigsetsize); return err; -#else - return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); -#endif } #endif @@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; - compat_sigset_t mask; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif @@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; + compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER @@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&mask, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), -- cgit v1.2.3 From 1681634b8c70353d8ca211b9b3443889a16dafeb Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:29 +0300 Subject: signal: simplify compat_sigpending() Remove "if it's big-endian..." ifdef in compat_sigpending(), use the endian-agnostic variant. Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 14ad6bb90dad..f59c05fc374a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3330,15 +3330,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sizeof(set.sig[0])); if (!err) err = put_user(set.sig[0], set32); return err; -#else - return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); -#endif } #endif -- cgit v1.2.3 From 176826af03666758c656dd27f098cc889b71638b Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:43 +0300 Subject: signal: lift sigset size check out of do_sigpending() As sigsetsize argument of do_sigpending() is not used anywhere else in that function after the check, remove this argument and move the check out of do_sigpending() into rt_sigpending() and its compat analog. Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/signal.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f59c05fc374a..9fbc574ced10 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2629,11 +2629,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(void *set, unsigned long sigsetsize) +static int do_sigpending(sigset_t *set) { - if (sigsetsize > sizeof(sigset_t)) - return -EINVAL; - spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); @@ -2653,7 +2650,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err && copy_to_user(uset, &set, sigsetsize)) err = -EFAULT; return err; @@ -2664,7 +2666,12 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err) err = put_compat_sigset(uset, &set, sigsetsize); return err; @@ -3331,7 +3338,7 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set, sizeof(set.sig[0])); + int err = do_sigpending(&set); if (!err) err = put_user(set.sig[0], set32); return err; -- cgit v1.2.3 From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 20:42:54 -0400 Subject: get rid of {get,put}_compat_itimerspec() no users left Signed-off-by: Al Viro --- kernel/compat.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 18dd902c9052..d43b18031116 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || - __compat_get_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || - __compat_put_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { -- cgit v1.2.3 From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 21:45:17 -0400 Subject: get_compat_sigset() similar to put_compat_sigset() Signed-off-by: Al Viro --- kernel/compat.c | 23 ++++++++++++++++------- kernel/signal.c | 27 ++++----------------------- 2 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d43b18031116..a46a4a40bb8b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -467,17 +467,26 @@ Efault: return -EFAULT; } -void -sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) +int +get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { +#ifdef __BIG_ENDIAN + compat_sigset_t v; + if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) + return -EFAULT; switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } +#else + if (copy_from_user(set, compat, sizeof(compat_sigset_t))) + return -EFAULT; +#endif + return 0; } -EXPORT_SYMBOL_GPL(sigset_from_compat); +EXPORT_SYMBOL_GPL(get_compat_sigset); int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, diff --git a/kernel/signal.c b/kernel/signal.c index 9fbc574ced10..36a523640894 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ @@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return -EINVAL; if (nset) { - compat_sigset_t new32; sigset_t new_set; int error; - if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&new_set, nset)) return -EFAULT; - - sigset_from_compat(&new_set, &new32); sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); @@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; -#else - return sys_rt_sigprocmask(how, (sigset_t __user *)nset, - (sigset_t __user *)oset, sigsetsize); -#endif } #endif @@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { - compat_sigset_t s32; sigset_t s; struct timespec t; siginfo_t info; @@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + if (get_compat_sigset(&s, uthese)) return -EFAULT; - sigset_from_compat(&s, &s32); if (uts) { if (compat_get_timespec(&t, uts)) @@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; - compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif - ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; - sigset_from_compat(&new_ka.sa.sa_mask, &mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); @@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t newset; - compat_sigset_t newset32; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&newset, unewset)) return -EFAULT; - sigset_from_compat(&newset, &newset32); return sigsuspend(&newset); -#else - /* on little-endian bitmaps don't care about granularity */ - return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); -#endif } #endif -- cgit v1.2.3 From abca5fc535a3ee0f36fb6d4468a453eaae769921 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2017 18:17:46 -0400 Subject: sched_rr_get_interval(): move compat to native, get rid of set_fs() switch to using timespec64 internally, while we are at it Signed-off-by: Al Viro --- kernel/compat.c | 16 ---------------- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a46a4a40bb8b..d1cee656a7ed 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -562,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, } #endif -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct compat_timespec __user *, interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs(old_fs); - if (compat_put_timespec(&t, interval)) - return -EFAULT; - return ret; -} - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..e74f0a5a8647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -5098,13 +5099,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * Return: On success, 0 and the timeslice is in @interval. Otherwise, * an error code. */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; unsigned int time_slice; struct rq_flags rf; - struct timespec t; struct rq *rq; int retval; @@ -5128,15 +5127,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, task_rq_unlock(rq, p, &rf); rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; + jiffies_to_timespec64(time_slice, t); + return 0; out_unlock: rcu_read_unlock(); return retval; } +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = compat_put_timespec64(&t, interval); + return retval; +} +#endif + void sched_show_task(struct task_struct *p) { unsigned long free = 0; -- cgit v1.2.3 From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: sched/cputime: Expose cputime_adjust() Will be used by basic cgroup resource stat reporting later. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..8839b6e8a104 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -585,9 +585,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; -- cgit v1.2.3 From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: cpuacct: Introduce cgroup_account_cputime[_field]() Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge() and cgroup_account_field(). This doesn't introduce any functional changes and will be used to add cgroup basic resource accounting. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar --- kernel/sched/cpuacct.h | 17 ----------------- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 7 files changed, 6 insertions(+), 23 deletions(-) delete mode 100644 kernel/sched/cpuacct.h (limited to 'kernel') diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index ba72807c73d4..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8839b6e8a104..e01b699bbd5b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..abd913c1b99e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..0ae69af95b8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fdc2c5d1f82e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..f0b98f978843 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -36,7 +37,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..ec0bb5ab9024 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) -- cgit v1.2.3 From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:05 -0700 Subject: cgroup: Implement cgroup2 basic CPU usage accounting In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner Cc: Waiman Long Cc: Roman Gushchin --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 8 + kernel/cgroup/cgroup.c | 24 ++- kernel/cgroup/stat.c | 334 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+), 3 deletions(-) create mode 100644 kernel/cgroup/stat.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..0acee616e06c 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..fa642c99586a 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); +/* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_boot(void); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..d036625556c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + cgroup_stat_show_cputime(seq, "cpu."); + return 0; } @@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5148,6 +5166,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..9cce79e89320 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "%susage_usec %llu\n" + "%suser_usec %llu\n" + "%ssystem_usec %llu\n", + prefix, usage, prefix, utime, prefix, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} -- cgit v1.2.3 From e0b477941d130576d9daf31160dab6e34335b10a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:04 +0200 Subject: genirq/debugfs: Show debug information for all irq descriptors Currently the debugfs shows only information about actively used interrupts like /proc/irq/ does. That's fine for most cases, but not helpful when internals of allocated, but unused interrupt descriptors have to debugged. It's also useful to provide information about all descriptors so leaks can be debugged in a simpler way. Move the debugfs registration to the descriptor allocation code. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.355525908@linutronix.de --- kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..0f6805b75fc8 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -462,6 +462,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); + irq_add_debugfs_entry(start + i, desc); } bitmap_set(allocated_irqs, start, cnt); return start; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d00132b5c325..0e8b48315f3c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1400,7 +1400,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); - irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; -- cgit v1.2.3 From 07557ccb8c83f315e409ddee181e9ffbf87c6ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:05 +0200 Subject: genirq/msi: Capture device name for debugfs For debugging the allocation of unused or potentially leaked interrupt descriptor it's helpful to have some information about the site which allocated them. In case of MSI this is simple because the caller hands the device struct pointer into the domain allocation function. Duplicate the device name and show it in the debugfs entry of the interrupt descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.433038426@linutronix.de --- kernel/irq/debugfs.c | 10 ++++++++++ kernel/irq/internals.h | 5 +++++ kernel/irq/msi.c | 6 +++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c3fdb36dec30..b7d1023b9b22 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -149,6 +149,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, ARRAY_SIZE(irqdesc_states)); @@ -226,6 +227,15 @@ static const struct file_operations dfs_irq_ops = { .release = single_release, }; +void irq_debugfs_copy_devname(int irq, struct device *dev) +{ + struct irq_desc *desc = irq_to_desc(irq); + const char *name = dev_name(dev); + + if (name) + desc->dev_name = kstrdup(name, GFP_KERNEL); +} + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { char name [10]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..cfaec2669093 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -443,7 +443,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); } +void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else @@ -458,4 +460,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3fa4bd59f569..94ecc0293844 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -16,6 +16,8 @@ #include #include +#include "internals.h" + /** * alloc_msi_entry - Allocate an initialize msi_entry * @dev: Pointer to the device for which this is allocated @@ -373,8 +375,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for (i = 0; i < desc->nvec_used; i++) + for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); + irq_debugfs_copy_devname(virq + i, dev); + } } if (ops->msi_finish) -- cgit v1.2.3 From c3e7239a7f43ce1ff407df5f5944bf0d42dc21bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:06 +0200 Subject: irqdomain/debugfs: Provide domain specific debug callback Some interrupt domains like the X86 vector domain has special requirements for debugging, like showing the vector usage on the CPUs. Add a callback to the irqdomain ops which can be filled in by domains which require it and add conditional invocations to the irqdomain and the per irq debug files. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.512937505@linutronix.de --- kernel/irq/debugfs.c | 2 ++ kernel/irq/irqdomain.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b7d1023b9b22..7f608ac39653 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) data->domain ? data->domain->name : ""); seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); irq_debug_show_chip(m, data, ind + 1); + if (data->domain && data->domain->ops && data->domain->ops->debug_show) + data->domain->ops->debug_show(m, NULL, data, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!data->parent_data) return; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..1a75a00ee01f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1810,6 +1810,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) d->revmap_size + d->revmap_direct_max_irq); seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); + if (d->ops && d->ops->debug_show) + d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; -- cgit v1.2.3 From 457f6d35072f395508255ef09fd08f824382cf85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:07 +0200 Subject: genirq: Make state consistent for !IRQ_DOMAIN_HIERARCHY In the !IRQ_DOMAIN_HIERARCHY cas the activation stubs are not setting/clearing the activation status bits. This is not a problem at the moment, but upcoming changes require a correct status. Add the set/clear incovations to the stub functions and move them to the core internal header to avoid duplication and visibility outside the core. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.591985591@linutronix.de --- kernel/irq/internals.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cfaec2669093..72b8da2a7c39 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -436,6 +436,17 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ +#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) +static inline void irq_domain_activate_irq(struct irq_data *data) +{ + irqd_set_activated(data); +} +static inline void irq_domain_deactivate_irq(struct irq_data *data) +{ + irqd_clr_activated(data); +} +#endif + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include -- cgit v1.2.3 From 239306fee8a59beb728faf8f42b13b2250b24841 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:08 +0200 Subject: genirq: Set managed shut down flag at init Managed interrupts should start up in managed shutdown mode. Set the status flag when initialising the irq descriptor. Signed-off-by: Thomas Gleixner