summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-13 16:01:46 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-13 16:01:46 -0700
commit30de24c7dd21348b142ee977b687afc70b392af6 (patch)
tree88aa84cb5c25d3832f716d6e3e50151bdb5c2cfb
parentf4990264565c2ccb8f193d22aad3b429eceee1ef (diff)
parent4a7a54a55e7237386cacc73f173e74329773ac93 (diff)
downloadlinux-30de24c7dd21348b142ee977b687afc70b392af6.tar.gz
linux-30de24c7dd21348b142ee977b687afc70b392af6.tar.bz2
linux-30de24c7dd21348b142ee977b687afc70b392af6.zip
Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cache QoS (RDT/CAR) updates from Thomas Gleixner: "Add support for pseudo-locked cache regions. Cache Allocation Technology (CAT) allows on certain CPUs to isolate a region of cache and 'lock' it. Cache pseudo-locking builds on the fact that a CPU can still read and write data pre-allocated outside its current allocated area on cache hit. With cache pseudo-locking data can be preloaded into a reserved portion of cache that no application can fill, and from that point on will only serve cache hits. The cache pseudo-locked memory is made accessible to user space where an application can map it into its virtual address space and thus have a region of memory with reduced average read latency. The locking is not perfect and gets totally screwed by WBINDV and similar mechanisms, but it provides a reasonable enhancement for certain types of latency sensitive applications. The implementation extends the current CAT mechanism and provides a generally useful exclusive CAT mode on which it builds the extra pseude-locked regions" * 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits) x86/intel_rdt: Disable PMU access x86/intel_rdt: Fix possible circular lock dependency x86/intel_rdt: Make CPU information accessible for pseudo-locked regions x86/intel_rdt: Support restoration of subset of permissions x86/intel_rdt: Fix cleanup of plr structure on error x86/intel_rdt: Move pseudo_lock_region_clear() x86/intel_rdt: Limit C-states dynamically when pseudo-locking active x86/intel_rdt: Support L3 cache performance event of Broadwell x86/intel_rdt: More precise L2 hit/miss measurements x86/intel_rdt: Create character device exposing pseudo-locked region x86/intel_rdt: Create debugfs files for pseudo-locking testing x86/intel_rdt: Create resctrl debug area x86/intel_rdt: Ensure RDT cleanup on exit x86/intel_rdt: Resctrl files reflect pseudo-locked information x86/intel_rdt: Support creation/removal of pseudo-locked region x86/intel_rdt: Pseudo-lock region creation/removal core x86/intel_rdt: Discover supported platforms via prefetch disable bits x86/intel_rdt: Add utilities to test pseudo-locked region possibility x86/intel_rdt: Split resource group removal in two x86/intel_rdt: Enable entering of pseudo-locksetup mode ...
-rw-r--r--Documentation/x86/intel_rdt_ui.txt380
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c11
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h143
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c129
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c1522
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h43
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c808
8 files changed, 2965 insertions, 75 deletions
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index a16aa2113840..f662d3c530e5 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -29,7 +29,11 @@ mount options are:
L2 and L3 CDP are controlled seperately.
RDT features are orthogonal. A particular system may support only
-monitoring, only control, or both monitoring and control.
+monitoring, only control, or both monitoring and control. Cache
+pseudo-locking is a unique way of using cache control to "pin" or
+"lock" data in the cache. Details can be found in
+"Cache Pseudo-Locking".
+
The mount succeeds if either of allocation or monitoring is present, but
only those files and directories supported by the system will be created.
@@ -65,6 +69,29 @@ related to allocation:
some platforms support devices that have their
own settings for cache use which can over-ride
these bits.
+"bit_usage": Annotated capacity bitmasks showing how all
+ instances of the resource are used. The legend is:
+ "0" - Corresponding region is unused. When the system's
+ resources have been allocated and a "0" is found
+ in "bit_usage" it is a sign that resources are
+ wasted.
+ "H" - Corresponding region is used by hardware only
+ but available for software use. If a resource
+ has bits set in "shareable_bits" but not all
+ of these bits appear in the resource groups'
+ schematas then the bits appearing in
+ "shareable_bits" but no resource group will
+ be marked as "H".
+ "X" - Corresponding region is available for sharing and
+ used by hardware and software. These are the
+ bits that appear in "shareable_bits" as
+ well as a resource group's allocation.
+ "S" - Corresponding region is used by software
+ and available for sharing.
+ "E" - Corresponding region is used exclusively by
+ one resource group. No sharing allowed.
+ "P" - Corresponding region is pseudo-locked. No
+ sharing allowed.
Memory bandwitdh(MB) subdirectory contains the following files
with respect to allocation:
@@ -151,6 +178,9 @@ All groups contain the following files:
CPUs to/from this group. As with the tasks file a hierarchy is
maintained where MON groups may only include CPUs owned by the
parent CTRL_MON group.
+ When the resouce group is in pseudo-locked mode this file will
+ only be readable, reflecting the CPUs associated with the
+ pseudo-locked region.
"cpus_list":
@@ -163,6 +193,21 @@ When control is enabled all CTRL_MON groups will also contain:
A list of all the resources available to this group.
Each resource has its own line and format - see below for details.
+"size":
+ Mirrors the display of the "schemata" file to display the size in
+ bytes of each allocation instead of the bits representing the
+ allocation.
+
+"mode":
+ The "mode" of the resource group dictates the sharing of its
+ allocations. A "shareable" resource group allows sharing of its
+ allocations while an "exclusive" resource group does not. A
+ cache pseudo-locked region is created by first writing
+ "pseudo-locksetup" to the "mode" file before writing the cache
+ pseudo-locked region's schemata to the resource group's "schemata"
+ file. On successful pseudo-locked region creation the mode will
+ automatically change to "pseudo-locked".
+
When monitoring is enabled all MON groups will also contain:
"mon_data":
@@ -379,6 +424,170 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+Cache Pseudo-Locking
+--------------------
+CAT enables a user to specify the amount of cache space that an
+application can fill. Cache pseudo-locking builds on the fact that a
+CPU can still read and write data pre-allocated outside its current
+allocated area on a cache hit. With cache pseudo-locking, data can be
+preloaded into a reserved portion of cache that no application can
+fill, and from that point on will only serve cache hits. The cache
+pseudo-locked memory is made accessible to user space where an
+application can map it into its virtual address space and thus have
+a region of memory with reduced average read latency.
+
+The creation of a cache pseudo-locked region is triggered by a request
+from the user to do so that is accompanied by a schemata of the region
+to be pseudo-locked. The cache pseudo-locked region is created as follows:
+- Create a CAT allocation CLOSNEW with a CBM matching the schemata
+ from the user of the cache region that will contain the pseudo-locked
+ memory. This region must not overlap with any current CAT allocation/CLOS
+ on the system and no future overlap with this cache region is allowed
+ while the pseudo-locked region exists.
+- Create a contiguous region of memory of the same size as the cache
+ region.
+- Flush the cache, disable hardware prefetchers, disable preemption.
+- Make CLOSNEW the active CLOS and touch the allocated memory to load
+ it into the cache.
+- Set the previous CLOS as active.
+- At this point the closid CLOSNEW can be released - the cache
+ pseudo-locked region is protected as long as its CBM does not appear in
+ any CAT allocation. Even though the cache pseudo-locked region will from
+ this point on not appear in any CBM of any CLOS an application running with
+ any CLOS will be able to access the memory in the pseudo-locked region since
+ the region continues to serve cache hits.
+- The contiguous region of memory loaded into the cache is exposed to
+ user-space as a character device.
+
+Cache pseudo-locking increases the probability that data will remain
+in the cache via carefully configuring the CAT feature and controlling
+application behavior. There is no guarantee that data is placed in
+cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict
+“locked” data from cache. Power management C-states may shrink or
+power off cache. Deeper C-states will automatically be restricted on
+pseudo-locked region creation.
+
+It is required that an application using a pseudo-locked region runs
+with affinity to the cores (or a subset of the cores) associated
+with the cache on which the pseudo-locked region resides. A sanity check
+within the code will not allow an application to map pseudo-locked memory
+unless it runs with affinity to cores associated with the cache on which the
+pseudo-locked region resides. The sanity check is only done during the
+initial mmap() handling, there is no enforcement afterwards and the
+application self needs to ensure it remains affine to the correct cores.
+
+Pseudo-locking is accomplished in two stages:
+1) During the first stage the system administrator allocates a portion
+ of cache that should be dedicated to pseudo-locking. At this time an
+ equivalent portion of memory is allocated, loaded into allocated
+ cache portion, and exposed as a character device.
+2) During the second stage a user-space application maps (mmap()) the
+ pseudo-locked memory into its address space.
+
+Cache Pseudo-Locking Interface
+------------------------------
+A pseudo-locked region is created using the resctrl interface as follows:
+
+1) Create a new resource group by creating a new directory in /sys/fs/resctrl.
+2) Change the new resource group's mode to "pseudo-locksetup" by writing
+ "pseudo-locksetup" to the "mode" file.
+3) Write the schemata of the pseudo-locked region to the "schemata" file. All
+ bits within the schemata should be "unused" according to the "bit_usage"
+ file.
+
+On successful pseudo-locked region creation the "mode" file will contain
+"pseudo-locked" and a new character device with the same name as the resource
+group will exist in /dev/pseudo_lock. This character device can be mmap()'ed
+by user space in order to obtain access to the pseudo-locked memory region.
+
+An example of cache pseudo-locked region creation and usage can be found below.
+
+Cache Pseudo-Locking Debugging Interface
+---------------------------------------
+The pseudo-locking debugging interface is enabled by default (if
+CONFIG_DEBUG_FS is enabled) and can be found in /sys/kernel/debug/resctrl.
+
+There is no explicit way for the kernel to test if a provided memory
+location is present in the cache. The pseudo-locking debugging interface uses
+the tracing infrastructure to provide two ways to measure cache residency of
+the pseudo-locked region:
+1) Memory access latency using the pseudo_lock_mem_latency tracepoint. Data
+ from these measurements are best visualized using a hist trigger (see
+ example below). In this test the pseudo-locked region is traversed at
+ a stride of 32 bytes while hardware prefetchers and preemption
+ are disabled. This also provides a substitute visualization of cache
+ hits and misses.
+2) Cache hit and miss measurements using model specific precision counters if
+ available. Depending on the levels of cache on the system the pseudo_lock_l2
+ and pseudo_lock_l3 tracepoints are available.
+ WARNING: triggering this measurement uses from two (for just L2
+ measurements) to four (for L2 and L3 measurements) precision counters on
+ the system, if any other measurements are in progress the counters and
+ their corresponding event registers will be clobbered.
+
+When a pseudo-locked region is created a new debugfs directory is created for
+it in debugfs as /sys/kernel/debug/resctrl/<newdir>. A single
+write-only file, pseudo_lock_measure, is present in this directory. The
+measurement on the pseudo-locked region depends on the number, 1 or 2,
+written to this debugfs file. Since the measurements are recorded with the
+tracing infrastructure the relevant tracepoints need to be enabled before the
+measurement is triggered.
+
+Example of latency debugging interface:
+In this example a pseudo-locked region named "newlock" was created. Here is
+how we can measure the latency in cycles of reading from this region and
+visualize this data with a histogram that is available if CONFIG_HIST_TRIGGERS
+is set:
+# :> /sys/kernel/debug/tracing/trace
+# echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/trigger
+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+# echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+# cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist
+
+# event histogram
+#
+# trigger info: hist:keys=latency:vals=hitcount:sort=hitcount:size=2048 [active]
+#
+
+{ latency: 456 } hitcount: 1
+{ latency: 50 } hitcount: 83
+{ latency: 36 } hitcount: 96
+{ latency: 44 } hitcount: 174
+{ latency: 48 } hitcount: 195
+{ latency: 46 } hitcount: 262
+{ latency: 42 } hitcount: 693
+{ latency: 40 } hitcount: 3204
+{ latency: 38 } hitcount: 3484
+
+Totals:
+ Hits: 8192
+ Entries: 9
+ Dropped: 0
+
+Example of cache hits/misses debugging:
+In this example a pseudo-locked region named "newlock" was created on the L2
+cache of a platform. Here is how we can obtain details of the cache hits
+and misses using the platform's precision counters.
+
+# :> /sys/kernel/debug/tracing/trace
+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
+# echo 2 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
+# cat /sys/kernel/debug/tracing/trace
+
+# tracer: nop
+#
+# _-----=> irqs-off
+# / _----=> need-resched
+# | / _---=> hardirq/softirq
+# || / _--=> preempt-depth
+# ||| / delay
+# TASK-PID CPU# |||| TIMESTAMP FUNCTION
+# | | | |||| | |
+ pseudo_lock_mea-1672 [002] .... 3132.860500: pseudo_lock_l2: hits=4097 miss=0
+
+
Examples for RDT allocation usage:
Example 1
@@ -502,7 +711,172 @@ siblings and only the real time threads are scheduled on the cores 4-7.
# echo F0 > p0/cpus
-4) Locking between applications
+Example 4
+---------
+
+The resource groups in previous examples were all in the default "shareable"
+mode allowing sharing of their cache allocations. If one resource group
+configures a cache allocation then nothing prevents another resource group
+to overlap with that allocation.
+
+In this example a new exclusive resource group will be created on a L2 CAT
+system with two L2 cache instances that can be configured with an 8-bit
+capacity bitmask. The new exclusive resource group will be configured to use
+25% of each cache instance.
+
+# mount -t resctrl resctrl /sys/fs/resctrl/
+# cd /sys/fs/resctrl
+
+First, we observe that the default group is configured to allocate to all L2
+cache:
+
+# cat schemata
+L2:0=ff;1=ff
+
+We could attempt to create the new resource group at this point, but it will
+fail because of the overlap with the schemata of the default group:
+# mkdir p0
+# echo 'L2:0=0x3;1=0x3' > p0/schemata
+# cat p0/mode
+shareable
+# echo exclusive > p0/mode
+-sh: echo: write error: Invalid argument
+# cat info/last_cmd_status
+schemata overlaps
+
+To ensure that there is no overlap with another resource group the default
+resource group's schemata has to change, making it possible for the new
+resource group to become exclusive.
+# echo 'L2:0=0xfc;1=0xfc' > schemata
+# echo exclusive > p0/mode
+# grep . p0/*
+p0/cpus:0
+p0/mode:exclusive
+p0/schemata:L2:0=03;1=03
+p0/size:L2:0=262144;1=262144
+
+A new resource group will on creation not overlap with an exclusive resource
+group:
+# mkdir p1
+# grep . p1/*
+p1/cpus:0
+p1/mode:shareable
+p1/schemata:L2:0=fc;1=fc
+p1/size:L2:0=786432;1=786432
+
+The bit_usage will reflect how the cache is used:
+# cat info/L2/bit_usage
+0=SSSSSSEE;1=SSSSSSEE
+
+A resource group cannot be forced to overlap with an exclusive resource group:
+# echo 'L2:0=0x1;1=0x1' > p1/schemata
+-sh: echo: write error: Invalid argument
+# cat info/last_cmd_status
+overlaps with exclusive group
+
+Example of Cache Pseudo-Locking
+-------------------------------
+Lock portion of L2 cache from cache id 1 using CBM 0x3. Pseudo-locked
+region is exposed at /dev/pseudo_lock/newlock that can be provided to
+application for argument to mmap().
+
+# mount -t resctrl resctrl /sys/fs/resctrl/
+# cd /sys/fs/resctrl
+
+Ensure that there are bits available that can be pseudo-locked, since only
+unused bits can be pseudo-locked the bits to be pseudo-locked needs to be
+removed from the default resource group's schemata:
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSSSS
+# echo 'L2:1=0xfc' > schemata
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSS00
+
+Create a new resource group that will be associated with the pseudo-locked
+region, indicate that it will be used for a pseudo-locked region, and
+configure the requested pseudo-locked region capacity bitmask:
+
+# mkdir newlock
+# echo pseudo-locksetup > newlock/mode
+# echo 'L2:1=0x3' > newlock/schemata
+
+On success the resource group's mode will change to pseudo-locked, the
+bit_usage will reflect the pseudo-locked region, and the character device
+exposing the pseudo-locked region will exist:
+
+# cat newlock/mode
+pseudo-locked
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSSPP
+# ls -l /dev/pseudo_lock/newlock
+crw------- 1 root root 243, 0 Apr 3 05:01 /dev/pseudo_lock/newlock
+
+/*
+ * Example code to access one page of pseudo-locked cache region
+ * from user space.
+ */
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+/*
+ * It is required that the application runs with affinity to only
+ * cores associated with the pseudo-locked region. Here the cpu
+ * is hardcoded for convenience of example.
+ */
+static int cpuid = 2;
+
+int main(int argc, char *argv[])
+{
+ cpu_set_t cpuset;
+ long page_size;
+ void *mapping;
+ int dev_fd;
+ int ret;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpuid, &cpuset);
+ ret = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+ if (ret < 0) {
+ perror("sched_setaffinity");
+ exit(EXIT_FAILURE);
+ }
+
+ dev_fd = open("/dev/pseudo_lock/newlock", O_RDWR);
+ if (dev_fd < 0) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+
+ mapping = mmap(0, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev_fd, 0);
+ if (mapping == MAP_FAILED) {
+ perror("mmap");
+ close(dev_fd);
+ exit(EXIT_FAILURE);
+ }
+
+ /* Application interacts with pseudo-locked memory @mapping */
+
+ ret = munmap(mapping, page_size);
+ if (ret < 0) {
+ perror("munmap");
+ close(dev_fd);
+ exit(EXIT_FAILURE);
+ }
+
+ close(dev_fd);
+ exit(EXIT_SUCCESS);
+}
+
+Locking between applications
+----------------------------
Certain operations on the resctrl filesystem, composed of read/writes
to/from multiple files, must be atomic.
@@ -510,7 +884,7 @@ to/from multiple files, must be atomic.
As an example, the allocation of an exclusive reservation of L3 cache
involves:
- 1. Read the cbmmasks from each directory
+ 1. Read the cbmmasks from each directory or the per-resource "bit_usage"
2. Find a contiguous set of bits in the global CBM bitmask that is clear
in any of the directory cbmmasks
3. Create a new directory
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7a40196967cb..347137e80bf5 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -35,7 +35,9 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
+CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ec4754f81cbd..abb71ac70443 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void)
return (rdt_mon_capable || rdt_alloc_capable);
}
+static enum cpuhp_state rdt_online;
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void)
cpuhp_remove_state(state);
return ret;
}
+ rdt_online = state;
for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
@@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void)
}
late_initcall(intel_rdt_late_init);
+
+static void __exit intel_rdt_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+ rdtgroup_exit();
+}
+
+__exitcall(intel_rdt_exit);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 39752825e376..4e588f36228f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -81,6 +81,34 @@ enum rdt_group_type {
};
/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ * allowed AND the allocations are Cache Pseudo-Locked
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+ RDT_MODE_SHAREABLE = 0,
+ RDT_MODE_EXCLUSIVE,
+ RDT_MODE_PSEUDO_LOCKSETUP,
+ RDT_MODE_PSEUDO_LOCKED,
+
+ /* Must be last */
+ RDT_NUM_MODES,
+};
+
+/**
* struct mongroup - store mon group's data in resctrl fs.
* @mon_data_kn kernlfs node for the mon_data directory
* @parent: parent rdtgrp
@@ -95,6 +123,43 @@ struct mongroup {
};
/**
+ * struct pseudo_lock_region - pseudo-lock region information
+ * @r: RDT resource to which this pseudo-locked region
+ * belongs
+ * @d: RDT domain to which this pseudo-locked region
+ * belongs
+ * @cbm: bitmask of the pseudo-locked region
+ * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
+ * completion
+ * @thread_done: variable used by waitqueue to test if pseudo-locking
+ * thread completed
+ * @cpu: core associated with the cache on which the setup code
+ * will be run
+ * @line_size: size of the cache lines
+ * @size: size of pseudo-locked region in bytes
+ * @kmem: the kernel memory associated with pseudo-locked region
+ * @minor: minor number of character device associated with this
+ * region
+ * @debugfs_dir: pointer to this region's directory in the debugfs
+ * filesystem
+ * @pm_reqs: Power management QoS requests related to this region
+ */
+struct pseudo_lock_region {
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ u32 cbm;
+ wait_queue_head_t lock_thread_wq;
+ int thread_done;
+ int cpu;
+ unsigned int line_size;
+ unsigned int size;
+ void *kmem;
+ unsigned int minor;
+ struct dentry *debugfs_dir;
+ struct list_head pm_reqs;
+};
+
+/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
* @rdtgroup_list: linked list for all rdtgroups
@@ -106,16 +171,20 @@ struct mongroup {
* @type: indicates type of this rdtgroup - either
* monitor only or ctrl_mon group
* @mon: mongroup related data
+ * @mode: mode of resource group
+ * @plr: pseudo-locked region
*/
struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- u32 closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
- enum rdt_group_type type;
- struct mongroup mon;
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+ enum rdtgrp_mode mode;
+ struct pseudo_lock_region *plr;
};
/* rdtgroup.flags */
@@ -148,6 +217,7 @@ extern struct list_head rdt_all_groups;
extern int max_name_width, max_data_width;
int __init rdtgroup_init(void);
+void __exit rdtgroup_exit(void);
/**
* struct rftype - describe each file in the resctrl file system
@@ -216,22 +286,24 @@ struct mbm_state {
* @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
* @new_ctrl: new ctrl value to be loaded
* @have_new_ctrl: did user provide new_ctrl for this domain
+ * @plr: pseudo-locked region (if any) associated with domain
*/
struct rdt_domain {
- struct list_head list;
- int id;
- struct cpumask cpu_mask;
- unsigned long *rmid_busy_llc;
- struct mbm_state *mbm_total;
- struct mbm_state *mbm_local;
- struct delayed_work mbm_over;
- struct delayed_work cqm_limbo;
- int mbm_work_cpu;
- int cqm_work_cpu;
- u32 *ctrl_val;
- u32 *mbps_val;
- u32 new_ctrl;
- bool have_new_ctrl;
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 *mbps_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+ struct pseudo_lock_region *plr;
};
/**
@@ -351,7 +423,7 @@ struct rdt_resource {
struct rdt_cache cache;
struct rdt_membw membw;
const char *format_str;
- int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
+ int (*parse_ctrlval) (void *data, struct rdt_resource *r,
struct rdt_domain *d);
struct list_head evt_list;
int num_rmid;
@@ -359,8 +431,8 @@ struct rdt_resource {
unsigned long fflags;
};
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -368,7 +440,7 @@ extern struct rdt_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-int __init rdtgroup_init(void);
+extern struct dentry *debugfs_resctrl;
enum {
RDT_RESOURCE_L3,
@@ -439,13 +511,32 @@ void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
void rdtgroup_kn_unlock(struct kernfs_node *kn);
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask);
struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ u32 _cbm, int closid, bool exclusive);
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+ u32 cbm);
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int update_domains(struct rdt_resource *r, int closid);
+void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 116d57b248d3..af358ca05160 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
+ char *buf = _buf;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
@@ -87,7 +88,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
* are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
* Additionally Haswell requires at least two bits set.
*/
-static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
unsigned long first_bit, zero_bit, val;
unsigned int cbm_len = r->cache.cbm_len;
@@ -122,22 +123,64 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
+struct rdt_cbm_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
{
- unsigned long data;
+ struct rdt_cbm_parse_data *data = _data;
+ struct rdtgroup *rdtgrp = data->rdtgrp;
+ u32 cbm_val;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
}
- if(!cbm_validate(buf, &data, r))
+ /*
+ * Cannot set up more than one pseudo-locked region in a cache
+ * hierarchy.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ rdtgroup_pseudo_locked_in_hierarchy(d)) {
+ rdt_last_cmd_printf("pseudo-locked region in hierarchy\n");
return -EINVAL;
- d->new_ctrl = data;
+ }
+
+ if (!cbm_validate(data->buf, &cbm_val, r))
+ return -EINVAL;
+
+ if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_SHAREABLE) &&
+ rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
+ rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The CBM may not overlap with the CBM of another closid if
+ * either is exclusive.
+ */
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) {
+ rdt_last_cmd_printf("overlaps with exclusive group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) {
+ if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ rdt_last_cmd_printf("overlaps with other group\n");
+ return -EINVAL;
+ }
+ }
+
+ d->new_ctrl = cbm_val;
d->have_new_ctrl = true;
return 0;
@@ -149,8 +192,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
* separated by ";". The "id" is in decimal, and must match one of
* the "id"s for this resource.
*/
-static int parse_line(char *line, struct rdt_resource *r)
+static int parse_line(char *line, struct rdt_resource *r,
+ struct rdtgroup *rdtgrp)
{
+ struct rdt_cbm_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
unsigned long dom_id;
@@ -167,15 +212,32 @@ next:
dom = strim(dom);
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
- if (r->parse_ctrlval(dom, r, d))
+ data.buf = dom;
+ data.rdtgrp = rdtgrp;
+ if (r->parse_ctrlval(&data, r, d))
return -EINVAL;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * In pseudo-locking setup mode and just
+ * parsed a valid CBM that should be
+ * pseudo-locked. Only one locked region per
+ * resource group and domain so just do
+ * the required initialization for single
+ * region and return.
+ */
+ rdtgrp->plr->r = r;
+ rdtgrp->plr->d = d;
+ rdtgrp->plr->cbm = d->new_ctrl;
+ d->plr = rdtgrp->plr;
+ return 0;
+ }
goto next;
}
}
return -EINVAL;
}
-static int update_domains(struct rdt_resource *r, int closid)
+int update_domains(struct rdt_resource *r, int closid)
{
struct msr_param msr_param;
cpumask_var_t cpu_mask;
@@ -220,13 +282,14 @@ done:
return 0;
}
-static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+static int rdtgroup_parse_resource(char *resname, char *tok,
+ struct rdtgroup *rdtgrp)
{
struct rdt_resource *r;
for_each_alloc_enabled_rdt_resource(r) {
- if (!strcmp(resname, r->name) && closid < r->num_closid)
- return parse_line(tok, r);
+ if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid)
+ return parse_line(tok, r, rdtgrp);
}
rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
return -EINVAL;
@@ -239,7 +302,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
struct rdt_domain *dom;
struct rdt_resource *r;<