diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 21:15:50 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 21:15:50 -0700 |
| commit | a52bbaf4a3b81e07430a91ee37ea76557c2c02ed (patch) | |
| tree | 237b8878b7b68b41a4bfd549b34d174c8439c3a9 | |
| parent | 16b76293c5c81e6345323d7aef41b26e8390f62d (diff) | |
| parent | 4797b7dfdfcf457075c36743d71e2b0feeaaa20f (diff) | |
| download | linux-a52bbaf4a3b81e07430a91ee37ea76557c2c02ed.tar.gz linux-a52bbaf4a3b81e07430a91ee37ea76557c2c02ed.tar.bz2 linux-a52bbaf4a3b81e07430a91ee37ea76557c2c02ed.zip | |
Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cpu updates from Ingo Molnar:
"The biggest changes are an extension of the Intel RDT code to extend
it with Intel Memory Bandwidth Allocation CPU support: MBA allows
bandwidth allocation between cores, while CBM (already upstream)
allows CPU cache partitioning.
There's also misc smaller fixes and updates"
* 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
x86/intel_rdt: Return error for incorrect resource names in schemata
x86/intel_rdt: Trim whitespace while parsing schemata input
x86/intel_rdt: Fix padding when resource is enabled via mount
x86/intel_rdt: Get rid of anon union
x86/cpu: Keep model defines sorted by model number
x86/intel_rdt/mba: Add schemata file support for MBA
x86/intel_rdt: Make schemata file parsers resource specific
x86/intel_rdt/mba: Add info directory files for Memory Bandwidth Allocation
x86/intel_rdt: Make information files resource specific
x86/intel_rdt/mba: Add primary support for Memory Bandwidth Allocation (MBA)
x86/intel_rdt/mba: Memory bandwith allocation feature detect
x86/intel_rdt: Add resource specific msr update function
x86/intel_rdt: Move CBM specific data into a struct
x86/intel_rdt: Cleanup namespace to support multiple resource types
Documentation, x86: Intel Memory bandwidth allocation
x86/intel_rdt: Organize code properly
x86/intel_rdt: Init padding only if a device exists
x86/intel_rdt: Add cpus_list rdtgroup file
x86/intel_rdt: Cleanup kernel-doc
x86/intel_rdt: Update schemata read to show data in tabular format
...
| -rw-r--r-- | Documentation/x86/intel_rdt_ui.txt | 124 | ||||
| -rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 2 | ||||
| -rw-r--r-- | arch/x86/include/asm/intel-family.h | 6 | ||||
| -rw-r--r-- | arch/x86/include/asm/intel_rdt.h | 157 | ||||
| -rw-r--r-- | arch/x86/include/asm/processor.h | 11 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.c | 350 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 125 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_schemata.c | 181 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/proc.c | 5 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/scattered.c | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/setup.c | 11 | ||||
| -rw-r--r-- | arch/x86/mm/init_32.c | 9 | ||||
| -rw-r--r-- | arch/x86/xen/enlighten.c | 1 |
13 files changed, 707 insertions, 276 deletions
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index 51cf6fa5591f..0f6d8477b66c 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -4,6 +4,7 @@ Copyright (C) 2016 Intel Corporation Fenghua Yu <fenghua.yu@intel.com> Tony Luck <tony.luck@intel.com> +Vikas Shivappa <vikas.shivappa@intel.com> This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3". @@ -22,19 +23,34 @@ Info directory The 'info' directory contains information about the enabled resources. Each resource has its own subdirectory. The subdirectory -names reflect the resource names. Each subdirectory contains the -following files: +names reflect the resource names. +Cache resource(L3/L2) subdirectory contains the following files: -"num_closids": The number of CLOSIDs which are valid for this - resource. The kernel uses the smallest number of - CLOSIDs of all enabled resources as limit. +"num_closids": The number of CLOSIDs which are valid for this + resource. The kernel uses the smallest number of + CLOSIDs of all enabled resources as limit. -"cbm_mask": The bitmask which is valid for this resource. This - mask is equivalent to 100%. +"cbm_mask": The bitmask which is valid for this resource. + This mask is equivalent to 100%. -"min_cbm_bits": The minimum number of consecutive bits which must be - set when writing a mask. +"min_cbm_bits": The minimum number of consecutive bits which + must be set when writing a mask. +Memory bandwitdh(MB) subdirectory contains the following files: + +"min_bandwidth": The minimum memory bandwidth percentage which + user can request. + +"bandwidth_gran": The granularity in which the memory bandwidth + percentage is allocated. The allocated + b/w percentage is rounded off to the next + control step available on the hardware. The + available bandwidth control steps are: + min_bandwidth + N * bandwidth_gran. + +"delay_linear": Indicates if the delay scale is linear or + non-linear. This field is purely informational + only. Resource groups --------------- @@ -59,6 +75,9 @@ There are three files associated with each group: given to the default (root) group. You cannot remove CPUs from the default group. +"cpus_list": One or more CPU ranges of logical CPUs assigned to this + group. Same rules apply like for the "cpus" file. + "schemata": A list of all the resources available to this group. Each resource has its own line and format - see below for details. @@ -107,6 +126,22 @@ and 0xA are not. On a system with a 20-bit mask each bit represents 5% of the capacity of the cache. You could partition the cache into four equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000. +Memory bandwidth(b/w) percentage +-------------------------------- +For Memory b/w resource, user controls the resource by indicating the +percentage of total memory b/w. + +The minimum bandwidth percentage value for each cpu model is predefined +and can be looked up through "info/MB/min_bandwidth". The bandwidth +granularity that is allocated is also dependent on the cpu model and can +be looked up at "info/MB/bandwidth_gran". The available bandwidth +control steps are: min_bw + N * bw_gran. Intermediate values are rounded +to the next control step available on the hardware. + +The bandwidth throttling is a core specific mechanism on some of Intel +SKUs. Using a high bandwidth and a low bandwidth setting on two threads +sharing a core will result in both threads being throttled to use the +low bandwidth. L3 details (code and data prioritization disabled) -------------------------------------------------- @@ -129,16 +164,38 @@ schemata format is always: L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... +Memory b/w Allocation details +----------------------------- + +Memory b/w domain is L3 cache. + + MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;... + +Reading/writing the schemata file +--------------------------------- +Reading the schemata file will show the state of all resources +on all domains. When writing you only need to specify those values +which you wish to change. E.g. + +# cat schemata +L3DATA:0=fffff;1=fffff;2=fffff;3=fffff +L3CODE:0=fffff;1=fffff;2=fffff;3=fffff +# echo "L3DATA:2=3c0;" > schemata +# cat schemata +L3DATA:0=fffff;1=fffff;2=3c0;3=fffff +L3CODE:0=fffff;1=fffff;2=fffff;3=fffff + Example 1 --------- On a two socket machine (one L3 cache per socket) with just four bits -for cache bit masks +for cache bit masks, minimum b/w of 10% with a memory bandwidth +granularity of 10% # mount -t resctrl resctrl /sys/fs/resctrl # cd /sys/fs/resctrl # mkdir p0 p1 -# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata -# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata +# echo "L3:0=3;1=c\nMB:0=50;1=50" > /sys/fs/resctrl/p0/schemata +# echo "L3:0=3;1=3\nMB:0=50;1=50" > /sys/fs/resctrl/p1/schemata The default resource group is unmodified, so we have access to all parts of all caches (its schemata file reads "L3:0=f;1=f"). @@ -147,6 +204,14 @@ Tasks that are under the control of group "p0" may only allocate from the "lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1. Tasks in group "p1" use the "lower" 50% of cache on both sockets. +Similarly, tasks that are under the control of group "p0" may use a +maximum memory b/w of 50% on socket0 and 50% on socket 1. +Tasks in group "p1" may also use 50% memory b/w on both sockets. +Note that unlike cache masks, memory b/w cannot specify whether these +allocations can overlap or not. The allocations specifies the maximum +b/w that the group may be able to use and the system admin can configure +the b/w accordingly. + Example 2 --------- Again two sockets, but this time with a more realistic 20-bit mask. @@ -160,9 +225,10 @@ of L3 cache on socket 0. # cd /sys/fs/resctrl First we reset the schemata for the default group so that the "upper" -50% of the L3 cache on socket 0 cannot be used by ordinary tasks: +50% of the L3 cache on socket 0 and 50% of memory b/w cannot be used by +ordinary tasks: -# echo "L3:0=3ff;1=fffff" > schemata +# echo "L3:0=3ff;1=fffff\nMB:0=50;1=100" > schemata Next we make a resource group for our first real time task and give it access to the "top" 25% of the cache on socket 0. @@ -185,6 +251,20 @@ Ditto for the second real time task (with the remaining 25% of cache): # echo 5678 > p1/tasks # taskset -cp 2 5678 +For the same 2 socket system with memory b/w resource and CAT L3 the +schemata would look like(Assume min_bandwidth 10 and bandwidth_gran is +10): + +For our first real time task this would request 20% memory b/w on socket +0. + +# echo -e "L3:0=f8000;1=fffff\nMB:0=20;1=100" > p0/schemata + +For our second real time task this would request an other 20% memory b/w +on socket 0. + +# echo -e "L3:0=f8000;1=fffff\nMB:0=20;1=100" > p0/schemata + Example 3 --------- @@ -198,18 +278,22 @@ the tasks. # cd /sys/fs/resctrl First we reset the schemata for the default group so that the "upper" -50% of the L3 cache on socket 0 cannot be used by ordinary tasks: +50% of the L3 cache on socket 0, and 50% of memory bandwidth on socket 0 +cannot be used by ordinary tasks: -# echo "L3:0=3ff" > schemata +# echo "L3:0=3ff\nMB:0=50" > schemata -Next we make a resource group for our real time cores and give -it access to the "top" 50% of the cache on socket 0. +Next we make a resource group for our real time cores and give it access +to the "top" 50% of the cache on socket 0 and 50% of memory bandwidth on +socket 0. # mkdir p0 -# echo "L3:0=ffc00;" > p0/schemata +# echo "L3:0=ffc00\nMB:0=50" > p0/schemata Finally we move core 4-7 over to the new group and make sure that the -kernel and the tasks running there get 50% of the cache. +kernel and the tasks running there get 50% of the cache. They should +also get 50% of memory bandwidth assuming that the cores 4-7 are SMT +siblings and only the real time threads are scheduled on the cores 4-7. # echo C0 > p0/cpus diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0fe00446f9ca..2701e5f8145b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -202,6 +202,8 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9814db42b790..75b748a1deb8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -12,6 +12,7 @@ */ #define INTEL_FAM6_CORE_YONAH 0x0E + #define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_FAM6_CORE2_PENRYN 0x17 @@ -21,6 +22,7 @@ #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_FAM6_NEHALEM_EX 0x2E + #define INTEL_FAM6_WESTMERE 0x25 #define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_FAM6_WESTMERE_EX 0x2F @@ -36,9 +38,9 @@ #define INTEL_FAM6_HASWELL_GT3E 0x46 #define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_BROADWELL_GT3E 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E @@ -59,8 +61,8 @@ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ #define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A /* Xeon Phi */ diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 0d64397cee58..597dc4995678 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -12,6 +12,7 @@ #define IA32_L3_QOS_CFG 0xc81 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 #define L3_QOS_CDP_ENABLE 0x01ULL @@ -37,23 +38,30 @@ struct rdtgroup { /* rdtgroup.flags */ #define RDT_DELETED 1 +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + /* List of all resource groups */ extern struct list_head rdt_all_groups; +extern int max_name_width, max_data_width; + int __init rdtgroup_init(void); /** * struct rftype - describe each file in the resctrl file system - * @name: file name - * @mode: access mode - * @kf_ops: operations - * @seq_show: show content of the file - * @write: write to the file + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @seq_show: Show content of the file + * @write: Write to the file */ struct rftype { char *name; umode_t mode; struct kernfs_ops *kf_ops; + unsigned long flags; int (*seq_show)(struct kernfs_open_file *of, struct seq_file *sf, void *v); @@ -67,54 +75,21 @@ struct rftype { }; /** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @max_cbm: Largest Cache Bit Mask allowed - * @min_cbm_bits: Minimum number of consecutive bits to be set - * in a cache bit mask - * @domains: All domains for this resource - * @num_domains: Number of domains active - * @msr_base: Base MSR address for CBMs - * @tmp_cbms: Scratch space when updating schemata - * @num_tmp_cbms: Number of CBMs in tmp_cbms - * @cache_level: Which cache level defines scope of this domain - * @cbm_idx_multi: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cbm_len; - int min_cbm_bits; - u32 max_cbm; - struct list_head domains; - int num_domains; - int msr_base; - u32 *tmp_cbms; - int num_tmp_cbms; - int cache_level; - int cbm_idx_multi; - int cbm_idx_offset; -}; - -/** * struct rdt_domain - group of cpus sharing an RDT resource * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which cpus share this resource - * @cbm: array of cache bit masks (indexed by CLOSID) + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain */ struct rdt_domain { struct list_head list; int id; struct cpumask cpu_mask; - u32 *cbm; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; }; /** @@ -129,6 +104,83 @@ struct msr_param { int high; }; +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @info_files: resctrl info files for the resource + * @nr_info_files: Number of info files + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + struct rftype *info_files; + int nr_info_files; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); +}; + +void rdt_get_cache_infofile(struct rdt_resource *r); +void rdt_get_mba_infofile(struct rdt_resource *r); +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + extern struct mutex rdtgroup_mutex; extern struct rdt_resource rdt_resources_all[]; @@ -142,6 +194,7 @@ enum { RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, /* Must be the last */ RDT_NUM_RESOURCES, @@ -149,7 +202,7 @@ enum { #define for_each_capable_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + r++) \ if (r->capable) #define for_each_enabled_rdt_resource(r) \ @@ -165,8 +218,16 @@ union cpuid_0x10_1_eax { unsigned int full; }; -/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ -union cpuid_0x10_1_edx { +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { struct { unsigned int cos_max:16; } split; @@ -175,7 +236,7 @@ union cpuid_0x10_1_edx { DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); -void rdt_cbm_update(void *arg); +void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a80c1b3997ed..78defd0aa220 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -80,7 +80,7 @@ extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice + * Members of this structure are referenced in head_32.S, so think twice * before touching them. [mj] */ @@ -89,14 +89,7 @@ struct cpuinfo_x86 { __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; __u8 x86_mask; -#ifdef CONFIG_X86_32 - char wp_works_ok; /* It doesn't on 386's */ - - /* Problems on some 486Dx4's and old 386's: */ - char rfu; - char pad0; - char pad1; -#else +#ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; #endif diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5a533fefefa0..5b366462f579 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -32,55 +32,98 @@ #include <asm/intel-family.h> #include <asm/intel_rdt.h> +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 + /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { { - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 0 + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 1 + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = IA32_L2_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 2, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .msr_base = IA32_MBA_THRTL_BASE, + .msr_update = mba_wrmsr, + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*d", }, }; -static int cbm_idx(struct rdt_resource *r, int closid) +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) { - return closid * r->cbm_idx_multi + r->cbm_idx_offset; + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; } /* @@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void) return false; r->num_closid = 4; - r->cbm_len = 20; - r->max_cbm = max_cbm; - r->min_cbm_bits = 2; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.min_cbm_bits = 2; r->capable = true; r->enabled = true; @@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void) return false; } -static void rdt_get_config(int idx, struct rdt_resource *r) +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool rdt_get_mem_config(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->membw.max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; + r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + } + r->data_width = 3; + rdt_get_mba_infofile(r); + + r->capable = true; + r->enabled = true; + + return true; +} + +static void rdt_get_cache_config(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; - union cpuid_0x10_1_edx edx; + union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->cbm_len = eax.split.cbm_len + 1; - r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->data_width = (r->cache.cbm_len + 3) / 4; + rdt_get_cache_infofile(r); r->capable = true; r->enabled = true; } @@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type) struct rdt_resource *r = &rdt_resources_all[type]; r->num_closid = r_l3->num_closid / 2; - r->cbm_len = r_l3->cbm_len; - r->max_cbm = r_l3->max_cbm; + r->cache.cbm_len = r_l3->cache.cbm_len; + r->default_ctrl = r_l3->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; r->capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter @@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type) r->enabled = false; } -static inline bool get_rdt_resources(void) -{ - bool ret = false; - - if (cache_alloc_hsw_probe()) - return true; - - if (!boot_cpu_has(X86_FEATURE_RDT_A)) - return false; - - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } - ret = true; - } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { - /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); - ret = true; - } - - return ret; -} - static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level) return -1; } -void rdt_cbm_update(void *arg) +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - struct msr_param *m = (struct msr_param *)arg; + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; struct rdt_resource *r = m->res; - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - goto found; + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + r->msr_update(d, m, r); + return; + } } - pr_info_once("cpu %d not found in any domain for resource %s\n", + pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); - - return; - -found: - for (i = m->low; i < m->high; i++) { - int idx = cbm_idx(r, i); - - wrmsrl(r->msr_base + idx, d->cbm[i]); - } } /* @@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc; + int i; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + d->ctrl_val = dc; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100 + */ + for (i = 0; i < r->num_closid; i++, dc++) + *dc = r->default_ctrl; + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int i, id = get_cache_id(cpu, r->cache_level); + int id = get_cache_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); - if (!d->cbm) { + if (domain_setup_ctrlval(r, d)) { kfree(d); return; } - for (i = 0; i < r->num_closid; i++) { - int idx = cbm_idx(r, i); - - d->cbm[i] = r->max_cbm; - wrmsrl(r->msr_base + idx, d->cbm[i]); - } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); - r->num_domains++; } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d |
