From b5169fd2771d60a359af7263d84453adc0b5dbfb Mon Sep 17 00:00:00 2001 From: Tim Steinbach Date: Tue, 2 May 2017 08:49:39 -0400 Subject: [PATCH] linux: Add cgroups patches for 4.9, 4.10, 4.11 --- .../kernel/cpu-cgroup-v2-patches/4.10.patch | 784 ++++++++++++++++++ .../kernel/cpu-cgroup-v2-patches/4.11.patch | 784 ++++++++++++++++++ .../kernel/cpu-cgroup-v2-patches/4.6.patch | 407 --------- .../kernel/cpu-cgroup-v2-patches/4.7.patch | 407 --------- .../kernel/cpu-cgroup-v2-patches/4.9.patch | 784 ++++++++++++++++++ pkgs/top-level/all-packages.nix | 13 +- 6 files changed, 2355 insertions(+), 824 deletions(-) create mode 100644 pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch create mode 100644 pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch delete mode 100644 pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.6.patch delete mode 100644 pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.7.patch create mode 100644 pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch diff --git a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch new file mode 100644 index 000000000000..21040239a2b7 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch @@ -0,0 +1,784 @@ +commit d0273888226b264d34795970c073d6e935d5114f +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Misc preps for cgroup unified hierarchy interface + + Make the following changes in preparation for the cpu controller + interface implementation for the unified hierarchy. This patch + doesn't cause any functional differences. + + * s/cpu_stats_show()/cpu_cfs_stats_show()/ + + * s/cpu_files/cpu_legacy_files/ + + * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While + at it, make the @val array u64 for consistency. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c56fb57f2991..112037890e9b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8724,7 +8724,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) + return ret; + } + +-static int cpu_stats_show(struct seq_file *sf, void *v) ++static int cpu_cfs_stats_show(struct seq_file *sf, void *v) + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +@@ -8764,7 +8764,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-static struct cftype cpu_files[] = { ++static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", +@@ -8785,7 +8785,7 @@ static struct cftype cpu_files[] = { + }, + { + .name = "stat", +- .seq_show = cpu_stats_show, ++ .seq_show = cpu_cfs_stats_show, + }, + #endif + #ifdef CONFIG_RT_GROUP_SCHED +@@ -8810,7 +8810,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, +- .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, + .early_init = true, + }; + +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index 9add206b5608..4dd7b8588b69 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) + return 0; + } + +-static int cpuacct_stats_show(struct seq_file *sf, void *v) ++static void cpuacct_stats_read(struct cpuacct *ca, ++ u64 (*val)[CPUACCT_STAT_NSTATS]) + { +- struct cpuacct *ca = css_ca(seq_css(sf)); +- s64 val[CPUACCT_STAT_NSTATS]; + int cpu; +- int stat; + +- memset(val, 0, sizeof(val)); ++ memset(val, 0, sizeof(*val)); ++ + for_each_possible_cpu(cpu) { + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + } ++} ++ ++static int cpuacct_stats_show(struct seq_file *sf, void *v) ++{ ++ u64 val[CPUACCT_STAT_NSTATS]; ++ int stat; ++ ++ cpuacct_stats_read(css_ca(seq_css(sf)), &val); + + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { +- seq_printf(sf, "%s %lld\n", ++ seq_printf(sf, "%s %llu\n", + cpuacct_stat_desc[stat], + (long long)cputime64_to_clock_t(val[stat])); + } + +commit 41103511fa43b4aa04fea259c5c60fef752ddefb +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Implement interface for cgroup unified hierarchy + + While the cpu controller doesn't have any functional problems, there + are a couple interface issues which can be addressed in the v2 + interface. + + * cpuacct being a separate controller. This separation is artificial + and rather pointless as demonstrated by most use cases co-mounting + the two controllers. It also forces certain information to be + accounted twice. + + * Use of different time units. Writable control knobs use + microseconds, some stat fields use nanoseconds while other cpuacct + stat fields use centiseconds. + + * Control knobs which can't be used in the root cgroup still show up + in the root. + + * Control knob names and semantics aren't consistent with other + controllers. + + This patchset implements cpu controller's interface on the unified + hierarchy which adheres to the controller file conventions described + in Documentation/cgroups/unified-hierarchy.txt. Overall, the + following changes are made. + + * cpuacct is implictly enabled and disabled by cpu and its information + is reported through "cpu.stat" which now uses microseconds for all + time durations. All time duration fields now have "_usec" appended + to them for clarity. While this doesn't solve the double accounting + immediately, once majority of users switch to v2, cpu can directly + account and report the relevant stats and cpuacct can be disabled on + the unified hierarchy. + + Note that cpuacct.usage_percpu is currently not included in + "cpu.stat". If this information is actually called for, it can be + added later. + + * "cpu.shares" is replaced with "cpu.weight" and operates on the + standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). + The weight is scaled to scheduler weight so that 100 maps to 1024 + and the ratio relationship is preserved - if weight is W and its + scaled value is S, W / 100 == S / 1024. While the mapped range is a + bit smaller than the orignal scheduler weight range, the dead zones + on both sides are relatively small and covers wider range than the + nice value mappings. This file doesn't make sense in the root + cgroup and isn't create on root. + + * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" + which contains both quota and period. + + * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by + "cpu.rt.max" which contains both runtime and period. + + v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for + CFS bandwidth stats and also using raw division for u64. Use + CONFIG_CFS_BANDWITH and do_div() instead. + + The semantics of "cpu.rt.max" is not fully decided yet. Dropped + for now. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 112037890e9b..a80d586a4317 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8803,6 +8803,139 @@ static struct cftype cpu_legacy_files[] = { + { } /* terminate */ + }; + ++static int cpu_stats_show(struct seq_file *sf, void *v) ++{ ++ cpuacct_cpu_stats_show(sf); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ struct task_group *tg = css_tg(seq_css(sf)); ++ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; ++ u64 throttled_usec; ++ ++ throttled_usec = cfs_b->throttled_time; ++ do_div(throttled_usec, NSEC_PER_USEC); ++ ++ seq_printf(sf, "nr_periods %d\n" ++ "nr_throttled %d\n" ++ "throttled_usec %llu\n", ++ cfs_b->nr_periods, cfs_b->nr_throttled, ++ throttled_usec); ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ struct task_group *tg = css_tg(css); ++ u64 weight = scale_load_down(tg->shares); ++ ++ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); ++} ++ ++static int cpu_weight_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 weight) ++{ ++ /* ++ * cgroup weight knobs should use the common MIN, DFL and MAX ++ * values which are 1, 100 and 10000 respectively. While it loses ++ * a bit of range on both ends, it maps pretty well onto the shares ++ * value used by scheduler and the round-trip conversions preserve ++ * the original value over the entire range. ++ */ ++ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) ++ return -ERANGE; ++ ++ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); ++ ++ return sched_group_set_shares(css_tg(css), scale_load(weight)); ++} ++#endif ++ ++static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, ++ long period, long quota) ++{ ++ if (quota < 0) ++ seq_puts(sf, "max"); ++ else ++ seq_printf(sf, "%ld", quota); ++ ++ seq_printf(sf, " %ld\n", period); ++} ++ ++/* caller should put the current value in *@periodp before calling */ ++static int __maybe_unused cpu_period_quota_parse(char *buf, ++ u64 *periodp, u64 *quotap) ++{ ++ char tok[21]; /* U64_MAX */ ++ ++ if (!sscanf(buf, "%s %llu", tok, periodp)) ++ return -EINVAL; ++ ++ *periodp *= NSEC_PER_USEC; ++ ++ if (sscanf(tok, "%llu", quotap)) ++ *quotap *= NSEC_PER_USEC; ++ else if (!strcmp(tok, "max")) ++ *quotap = RUNTIME_INF; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++static int cpu_max_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); ++ return 0; ++} ++ ++static ssize_t cpu_max_write(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, loff_t off) ++{ ++ struct task_group *tg = css_tg(of_css(of)); ++ u64 period = tg_get_cfs_period(tg); ++ u64 quota; ++ int ret; ++ ++ ret = cpu_period_quota_parse(buf, &period, "a); ++ if (!ret) ++ ret = tg_set_cfs_bandwidth(tg, period, quota); ++ return ret ?: nbytes; ++} ++#endif ++ ++static struct cftype cpu_files[] = { ++ { ++ .name = "stat", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_stats_show, ++ }, ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ { ++ .name = "weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_weight_read_u64, ++ .write_u64 = cpu_weight_write_u64, ++ }, ++#endif ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ .name = "max", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_max_show, ++ .write = cpu_max_write, ++ }, ++#endif ++ { } /* terminate */ ++}; ++ + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, +@@ -8811,7 +8944,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, + .early_init = true, ++#ifdef CONFIG_CGROUP_CPUACCT ++ /* ++ * cpuacct is enabled together with cpu on the unified hierarchy ++ * and its stats are reported through "cpu.stat". ++ */ ++ .depends_on = 1 << cpuacct_cgrp_id, ++#endif + }; + + #endif /* CONFIG_CGROUP_SCHED */ +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index 4dd7b8588b69..97c6dd7d8f59 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -347,6 +347,31 @@ static struct cftype files[] = { + { } /* terminate */ + }; + ++/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ ++void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++ struct cgroup_subsys_state *css; ++ u64 usage, val[CPUACCT_STAT_NSTATS]; ++ ++ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); ++ ++ usage = cpuusage_read(css, seq_cft(sf)); ++ cpuacct_stats_read(css_ca(css), &val); ++ ++ val[CPUACCT_STAT_USER] *= TICK_NSEC; ++ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC; ++ do_div(usage, NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC); ++ ++ seq_printf(sf, "usage_usec %llu\n" ++ "user_usec %llu\n" ++ "system_usec %llu\n", ++ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]); ++ ++ css_put(css); ++} ++ + /* + * charge this task's execution time to its accounting group. + * +diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h +index ba72807c73d4..ddf7af466d35 100644 +--- a/kernel/sched/cpuacct.h ++++ b/kernel/sched/cpuacct.h +@@ -2,6 +2,7 @@ + + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); ++extern void cpuacct_cpu_stats_show(struct seq_file *sf); + + #else + +@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) + { + } + ++static inline void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++} ++ + #endif + +commit 2dae6b0ec091c93131e02eb56987ec6c26818f42 +Author: Tejun Heo +Date: Fri Aug 5 12:41:01 2016 -0400 + + cgroup: add documentation regarding CPU controller cgroup v2 support + + Signed-off-by: Tejun Heo + +diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt +new file mode 100644 +index 000000000000..1ed7032d4472 +--- /dev/null ++++ b/Documentation/cgroup-v2-cpu.txt +@@ -0,0 +1,368 @@ ++ ++ ++CPU Controller on Control Group v2 ++ ++August, 2016 Tejun Heo ++ ++ ++While most controllers have support for cgroup v2 now, the CPU ++controller support is not upstream yet due to objections from the ++scheduler maintainers on the basic designs of cgroup v2. This ++document explains the current situation as well as an interim ++solution, and details the disagreements and arguments. The latest ++version of this document can be found at the following URL. ++ ++ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu ++ ++This document was posted to the linux-kernel and cgroup mailing lists. ++Unfortunately, no consensus was reached as of Oct, 2016. The thread ++can be found at the following URL. ++ ++ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org ++ ++ ++CONTENTS ++ ++1. Current Situation and Interim Solution ++2. Disagreements and Arguments ++ 2-1. Contentious Restrictions ++ 2-1-1. Process Granularity ++ 2-1-2. No Internal Process Constraint ++ 2-2. Impact on CPU Controller ++ 2-2-1. Impact of Process Granularity ++ 2-2-2. Impact of No Internal Process Constraint ++ 2-3. Arguments for cgroup v2 ++3. Way Forward ++4. References ++ ++ ++1. Current Situation and Interim Solution ++ ++All objections from the scheduler maintainers apply to cgroup v2 core ++design, and there are no known objections to the specifics of the CPU ++controller cgroup v2 interface. The only blocked part is changes to ++expose the CPU controller interface on cgroup v2, which comprises the ++following two patches: ++ ++ [1] sched: Misc preps for cgroup unified hierarchy interface ++ [2] sched: Implement interface for cgroup unified hierarchy ++ ++The necessary changes are superficial and implement the interface ++files on cgroup v2. The combined diffstat is as follows. ++ ++ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-- ++ kernel/sched/cpuacct.c | 57 ++++++++++++------ ++ kernel/sched/cpuacct.h | 5 + ++ 3 files changed, 189 insertions(+), 22 deletions(-) ++ ++The patches are easy to apply and forward-port. The following git ++branch will always carry the two patches on top of the latest release ++of the upstream kernel. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu ++ ++There also are versioned branches going back to v4.4. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER ++ ++While it's difficult to tell whether the CPU controller support will ++be merged, there are crucial resource control features in cgroup v2 ++that are only possible due to the design choices that are being ++objected to, and every effort will be made to ease enabling the CPU ++controller cgroup v2 support out-of-tree for parties which choose to. ++ ++ ++2. Disagreements and Arguments ++ ++There have been several lengthy discussion threads [3][4] on LKML ++around the structural constraints of cgroup v2. The two that affect ++the CPU controller are process granularity and no internal process ++constraint. Both arise primarily from the need for common resource ++domain definition across different resources. ++ ++The common resource domain is a powerful concept in cgroup v2 that ++allows controllers to make basic assumptions about the structural ++organization of processes and controllers inside the cgroup hierarchy, ++and thus solve problems spanning multiple types of resources. The ++prime example for this is page cache writeback: dirty page cache is ++regulated through throttling buffered writers based on memory ++availability, and initiating batched write outs to the disk based on ++IO capacity. Tracking and controlling writeback inside a cgroup thus ++requires the direct cooperation of the memory and the IO controller. ++ ++This easily extends to other areas, such as CPU cycles consumed while ++performing memory reclaim or IO encryption. ++ ++ ++2-1. Contentious Restrictions ++ ++For controllers of different resources to work together, they must ++agree on a common organization. This uniform model across controllers ++imposes two contentious restrictions on the CPU controller: process ++granularity and the no-internal-process constraint. ++ ++ ++ 2-1-1. Process Granularity ++ ++ For memory, because an address space is shared between all threads ++ of a process, the terminal consumer is a process, not a thread. ++ Separating the threads of a single process into different memory ++ control domains doesn't make semantical sense. cgroup v2 ensures ++ that all controller can agree on the same organization by requiring ++ that threads of the same process belong to the same cgroup. ++ ++ There are other reasons to enforce process granularity. One ++ important one is isolating system-level management operations from ++ in-process application operations. The cgroup interface, being a ++ virtual filesystem, is very unfit for multiple independent ++ operations taking place at the same time as most operations have to ++ be multi-step and there is no way to synchronize multiple accessors. ++ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity" ++ ++ ++ 2-1-2. No Internal Process Constraint ++ ++ cgroup v2 does not allow processes to belong to any cgroup which has ++ child cgroups when resource controllers are enabled on it (the ++ notable exception being the root cgroup itself). This is because, ++ for some resources, a resource domain (cgroup) is not directly ++ comparable to the terminal consumer (process/task) of said resource, ++ and so putting the two into a sibling relationship isn't meaningful. ++ ++ - Differing Control Parameters and Capabilities ++ ++ A cgroup controller has different resource control parameters and ++ capabilities from a terminal consumer, be that a task or process. ++ There are a couple cases where a cgroup control knob can be mapped ++ to a per-task or per-process API but they are exceptions and the ++ mappings aren't obvious even in those cases. ++ ++ For example, task priorities (also known as nice values) set ++ through setpriority(2) are mapped to the CPU controller ++ "cpu.shares" values. However, how exactly the two ranges map and ++ even the fact that they map to each other at all are not obvious. ++ ++ The situation gets further muddled when considering other resource ++ types and control knobs. IO priorities set through ioprio_set(2) ++ cannot be mapped to IO controller weights and most cgroup resource ++ control knobs including the bandwidth control knobs of the CPU ++ controller don't have counterparts in the terminal consumers. ++ ++ - Anonymous Resource Consumption ++ ++ For CPU, every time slice consumed from inside a cgroup, which ++ comprises most but not all of consumed CPU time for the cgroup, ++ can be clearly attributed to a specific task or process. Because ++ these two types of entities are directly comparable as consumers ++ of CPU time, it's theoretically possible to mix tasks and cgroups ++ on the same tree levels and let them directly compete for the time ++ quota available to their common ancestor. ++ ++ However, the same can't be said for resource types like memory or ++ IO: the memory consumed by the page cache, for example, can be ++ tracked on a per-cgroup level, but due to mismatches in lifetimes ++ of involved objects (page cache can persist long after processes ++ are gone), shared usages and the implementation overhead of ++ tracking persistent state, it can no longer be attributed to ++ individual processes after instantiation. Consequently, any IO ++ incurred by page cache writeback can be attributed to a cgroup, ++ but not to the individual consumers inside the cgroup. ++ ++ For memory and IO, this makes a resource domain (cgroup) an object ++ of a fundamentally different type than a terminal consumer ++ (process). A process can't be a first class object in the resource ++ distribution graph as its total resource consumption can't be ++ described without the containing resource domain. ++ ++ Disallowing processes in internal cgroups avoids competition between ++ cgroups and processes which cannot be meaningfully defined for these ++ resources. All resource control takes place among cgroups and a ++ terminal consumer interacts with the containing cgroup the same way ++ it would with the system without cgroup. ++ ++ Root cgroup is exempt from this constraint, which is in line with ++ how root cgroup is handled in general - it's excluded from cgroup ++ resource accounting and control. ++ ++ ++Enforcing process granularity and no internal process constraint ++allows all controllers to be on the same footing in terms of resource ++distribution hierarchy. ++ ++ ++2-2. Impact on CPU Controller ++ ++As indicated earlier, the CPU controller's resource distribution graph ++is the simplest. Every schedulable resource consumption can be ++attributed to a specific task. In addition, for weight based control, ++the per-task priority set through setpriority(2) can be translated to ++and from a per-cgroup weight. As such, the CPU controller can treat a ++task and a cgroup symmetrically, allowing support for any tree layout ++of cgroups and tasks. Both process granularity and the no internal ++process constraint restrict how the CPU controller can be used. ++ ++ ++ 2-2-1. Impact of Process Granularity ++ ++ Process granularity prevents tasks belonging to the same process to ++ be assigned to different cgroups. It was pointed out [6] that this ++ excludes the valid use case of hierarchical CPU distribution within ++ processes. ++ ++ To address this issue, the rgroup (resource group) [7][8][9] ++ interface, an extension of the existing setpriority(2) API, was ++ proposed, which is in line with other programmable priority ++ mechanisms and eliminates the risk of in-application configuration ++ and system configuration stepping on each other's toes. ++ Unfortunately, the proposal quickly turned into discussions around ++ cgroup v2 design decisions [4] and no consensus could be reached. ++ ++ ++ 2-2-2. Impact of No Internal Process Constraint ++ ++ The no internal process constraint disallows tasks from competing ++ directly against cgroups. Here is an excerpt from Peter Zijlstra ++ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and ++ t4 are tasks: ++ ++ ++ R ++ / | \ ++ t1 t2 A ++ / \ ++ t3 t4 ++ ++ ++ Is fundamentally different from: ++ ++ ++ R ++ / \ ++ L A ++ / \ / \ ++ t1 t2 t3 t4 ++ ++ ++ Because if in the first hierarchy you add a task (t5) to R, all of ++ its A will run at 1/4th of total bandwidth where before it had ++ 1/3rd, whereas with the second example, if you add our t5 to L, A ++ doesn't get any less bandwidth. ++ ++ ++ It is true that the trees are semantically different from each other ++ and the symmetric handling of tasks and cgroups is aesthetically ++ pleasing. However, it isn't clear what the practical usefulness of ++ a layout with direct competition between tasks and cgroups would be, ++ considering that number and behavior of tasks are controlled by each ++ application, and cgroups primarily deal with system level resource ++ distribution; changes in the number of active threads would directly ++ impact resource distribution. Real world use cases of such layouts ++ could not be established during the discussions. ++ ++ ++2-3. Arguments for cgroup v2 ++ ++There are strong demands for comprehensive hierarchical resource ++control across all major resources, and establishing a common resource ++hierarchy is an essential step. As with most engineering decisions, ++common resource hierarchy definition comes with its trade-offs. With ++cgroup v2, the trade-offs are in the form of structural constraints ++which, among others, restrict the CPU controller's space of possible ++configurations. ++ ++However, even with the restrictions, cgroup v2, in combination with ++rgroup, covers most of identified real world use cases while enabling ++new important use cases of resource control across multiple resource ++types that were fundamentally broken previously. ++ ++Furthermore, for resource control, treating resource domains as ++objects of a different type from terminal consumers has important ++advantages - it can account for resource consumptions which are not ++tied to any specific terminal consumer, be that a task or process, and ++allows decoupling resource distribution controls from in-application ++APIs. Even the CPU controller may benefit from it as the kernel can ++consume significant amount of CPU cycles in interrupt context or tasks ++shared across multiple resource domains (e.g. softirq). ++ ++Finally, it's important to note that enabling cgroup v2 support for ++the CPU controller doesn't block use cases which require the features ++which are not available on cgroup v2. Unlikely, but should anybody ++actually rely on the CPU controller's symmetric handling of tasks and ++cgroups, backward compatibility is and will be maintained by being ++able to disconnect the controller from the cgroup v2 hierarchy and use ++it standalone. This also holds for cpuset which is often used in ++highly customized configurations which might be a poor fit for common ++resource domains. ++ ++The required changes are minimal, the benefits for the target use ++cases are critical and obvious, and use cases which have to use v1 can ++continue to do so. ++ ++ ++3. Way Forward ++ ++cgroup v2 primarily aims to solve the problem of comprehensive ++hierarchical resource control across all major computing resources, ++which is one of the core problems of modern server infrastructure ++engineering. The trade-offs that cgroup v2 took are results of ++pursuing that goal and gaining a better understanding of the nature of ++resource control in the process. ++ ++I believe that real world usages will prove cgroup v2's model right, ++considering the crucial pieces of comprehensive resource control that ++cannot be implemented without common resource domains. This is not to ++say that cgroup v2 is fixed in stone and can't be updated; if there is ++an approach which better serves both comprehensive resource control ++and the CPU controller's flexibility, we will surely move towards ++that. It goes without saying that discussions around such approach ++should consider practical aspects of resource control as a whole ++rather than absolutely focusing on a particular controller. ++ ++Until such consensus can be reached, the CPU controller cgroup v2 ++support will be maintained out of the mainline kernel in an easily ++accessible form. If there is anything cgroup developers can do to ++ease the pain, please feel free to contact us on the cgroup mailing ++list at cgroups@vger.kernel.org. ++ ++ ++4. References ++ ++[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org ++ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface ++ Tejun Heo ++ ++[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org ++ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org ++ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Peter Zijlstra ++ ++[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt ++ Control Group v2 ++ Tejun Heo ++ ++[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com ++ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Paul Turner ++ ++[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org ++ [RFD] cgroup: thread granularity support for cpu controller ++ Tejun Heo ++ ++[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org ++ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Tejun Heo ++ ++[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org ++ Example program for PRIO_RGRP ++ Tejun Heo ++ ++[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource ++ Peter Zijlstra diff --git a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch new file mode 100644 index 000000000000..38cc0532ba97 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch @@ -0,0 +1,784 @@ +commit 827b86ad1dd21feed4c0b99faf6059f245f7dadb +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Misc preps for cgroup unified hierarchy interface + + Make the following changes in preparation for the cpu controller + interface implementation for the unified hierarchy. This patch + doesn't cause any functional differences. + + * s/cpu_stats_show()/cpu_cfs_stats_show()/ + + * s/cpu_files/cpu_legacy_files/ + + * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While + at it, make the @val array u64 for consistency. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3b31fc05a0f1..a1b95e83fa87 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7174,7 +7174,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) + return ret; + } + +-static int cpu_stats_show(struct seq_file *sf, void *v) ++static int cpu_cfs_stats_show(struct seq_file *sf, void *v) + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +@@ -7214,7 +7214,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-static struct cftype cpu_files[] = { ++static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", +@@ -7235,7 +7235,7 @@ static struct cftype cpu_files[] = { + }, + { + .name = "stat", +- .seq_show = cpu_stats_show, ++ .seq_show = cpu_cfs_stats_show, + }, + #endif + #ifdef CONFIG_RT_GROUP_SCHED +@@ -7261,7 +7261,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, +- .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, + .early_init = true, + }; + +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index f95ab29a45d0..6151c23f722f 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) + return 0; + } + +-static int cpuacct_stats_show(struct seq_file *sf, void *v) ++static void cpuacct_stats_read(struct cpuacct *ca, ++ u64 (*val)[CPUACCT_STAT_NSTATS]) + { +- struct cpuacct *ca = css_ca(seq_css(sf)); +- s64 val[CPUACCT_STAT_NSTATS]; + int cpu; +- int stat; + +- memset(val, 0, sizeof(val)); ++ memset(val, 0, sizeof(*val)); ++ + for_each_possible_cpu(cpu) { + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + } ++} ++ ++static int cpuacct_stats_show(struct seq_file *sf, void *v) ++{ ++ u64 val[CPUACCT_STAT_NSTATS]; ++ int stat; ++ ++ cpuacct_stats_read(css_ca(seq_css(sf)), &val); + + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { +- seq_printf(sf, "%s %lld\n", ++ seq_printf(sf, "%s %llu\n", + cpuacct_stat_desc[stat], + (long long)nsec_to_clock_t(val[stat])); + } + +commit fdb64d002b3a223ce4bb11aa4448a42050470052 +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Implement interface for cgroup unified hierarchy + + While the cpu controller doesn't have any functional problems, there + are a couple interface issues which can be addressed in the v2 + interface. + + * cpuacct being a separate controller. This separation is artificial + and rather pointless as demonstrated by most use cases co-mounting + the two controllers. It also forces certain information to be + accounted twice. + + * Use of different time units. Writable control knobs use + microseconds, some stat fields use nanoseconds while other cpuacct + stat fields use centiseconds. + + * Control knobs which can't be used in the root cgroup still show up + in the root. + + * Control knob names and semantics aren't consistent with other + controllers. + + This patchset implements cpu controller's interface on the unified + hierarchy which adheres to the controller file conventions described + in Documentation/cgroups/unified-hierarchy.txt. Overall, the + following changes are made. + + * cpuacct is implictly enabled and disabled by cpu and its information + is reported through "cpu.stat" which now uses microseconds for all + time durations. All time duration fields now have "_usec" appended + to them for clarity. While this doesn't solve the double accounting + immediately, once majority of users switch to v2, cpu can directly + account and report the relevant stats and cpuacct can be disabled on + the unified hierarchy. + + Note that cpuacct.usage_percpu is currently not included in + "cpu.stat". If this information is actually called for, it can be + added later. + + * "cpu.shares" is replaced with "cpu.weight" and operates on the + standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). + The weight is scaled to scheduler weight so that 100 maps to 1024 + and the ratio relationship is preserved - if weight is W and its + scaled value is S, W / 100 == S / 1024. While the mapped range is a + bit smaller than the orignal scheduler weight range, the dead zones + on both sides are relatively small and covers wider range than the + nice value mappings. This file doesn't make sense in the root + cgroup and isn't create on root. + + * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" + which contains both quota and period. + + * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by + "cpu.rt.max" which contains both runtime and period. + + v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for + CFS bandwidth stats and also using raw division for u64. Use + CONFIG_CFS_BANDWITH and do_div() instead. + + The semantics of "cpu.rt.max" is not fully decided yet. Dropped + for now. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a1b95e83fa87..f01d56e58a1b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7253,6 +7253,139 @@ static struct cftype cpu_legacy_files[] = { + { } /* Terminate */ + }; + ++static int cpu_stats_show(struct seq_file *sf, void *v) ++{ ++ cpuacct_cpu_stats_show(sf); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ struct task_group *tg = css_tg(seq_css(sf)); ++ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; ++ u64 throttled_usec; ++ ++ throttled_usec = cfs_b->throttled_time; ++ do_div(throttled_usec, NSEC_PER_USEC); ++ ++ seq_printf(sf, "nr_periods %d\n" ++ "nr_throttled %d\n" ++ "throttled_usec %llu\n", ++ cfs_b->nr_periods, cfs_b->nr_throttled, ++ throttled_usec); ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ struct task_group *tg = css_tg(css); ++ u64 weight = scale_load_down(tg->shares); ++ ++ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); ++} ++ ++static int cpu_weight_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 weight) ++{ ++ /* ++ * cgroup weight knobs should use the common MIN, DFL and MAX ++ * values which are 1, 100 and 10000 respectively. While it loses ++ * a bit of range on both ends, it maps pretty well onto the shares ++ * value used by scheduler and the round-trip conversions preserve ++ * the original value over the entire range. ++ */ ++ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) ++ return -ERANGE; ++ ++ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); ++ ++ return sched_group_set_shares(css_tg(css), scale_load(weight)); ++} ++#endif ++ ++static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, ++ long period, long quota) ++{ ++ if (quota < 0) ++ seq_puts(sf, "max"); ++ else ++ seq_printf(sf, "%ld", quota); ++ ++ seq_printf(sf, " %ld\n", period); ++} ++ ++/* caller should put the current value in *@periodp before calling */ ++static int __maybe_unused cpu_period_quota_parse(char *buf, ++ u64 *periodp, u64 *quotap) ++{ ++ char tok[21]; /* U64_MAX */ ++ ++ if (!sscanf(buf, "%s %llu", tok, periodp)) ++ return -EINVAL; ++ ++ *periodp *= NSEC_PER_USEC; ++ ++ if (sscanf(tok, "%llu", quotap)) ++ *quotap *= NSEC_PER_USEC; ++ else if (!strcmp(tok, "max")) ++ *quotap = RUNTIME_INF; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++static int cpu_max_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); ++ return 0; ++} ++ ++static ssize_t cpu_max_write(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, loff_t off) ++{ ++ struct task_group *tg = css_tg(of_css(of)); ++ u64 period = tg_get_cfs_period(tg); ++ u64 quota; ++ int ret; ++ ++ ret = cpu_period_quota_parse(buf, &period, "a); ++ if (!ret) ++ ret = tg_set_cfs_bandwidth(tg, period, quota); ++ return ret ?: nbytes; ++} ++#endif ++ ++static struct cftype cpu_files[] = { ++ { ++ .name = "stat", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_stats_show, ++ }, ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ { ++ .name = "weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_weight_read_u64, ++ .write_u64 = cpu_weight_write_u64, ++ }, ++#endif ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ .name = "max", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_max_show, ++ .write = cpu_max_write, ++ }, ++#endif ++ { } /* terminate */ ++}; ++ + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, +@@ -7262,7 +7395,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, + .early_init = true, ++#ifdef CONFIG_CGROUP_CPUACCT ++ /* ++ * cpuacct is enabled together with cpu on the unified hierarchy ++ * and its stats are reported through "cpu.stat". ++ */ ++ .depends_on = 1 << cpuacct_cgrp_id, ++#endif + }; + + #endif /* CONFIG_CGROUP_SCHED */ +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index 6151c23f722f..fc1cf13c3af1 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -347,6 +347,31 @@ static struct cftype files[] = { + { } /* terminate */ + }; + ++/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ ++void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++ struct cgroup_subsys_state *css; ++ u64 usage, val[CPUACCT_STAT_NSTATS]; ++ ++ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); ++ ++ usage = cpuusage_read(css, seq_cft(sf)); ++ cpuacct_stats_read(css_ca(css), &val); ++ ++ val[CPUACCT_STAT_USER] *= TICK_NSEC; ++ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC; ++ do_div(usage, NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC); ++ ++ seq_printf(sf, "usage_usec %llu\n" ++ "user_usec %llu\n" ++ "system_usec %llu\n", ++ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]); ++ ++ css_put(css); ++} ++ + /* + * charge this task's execution time to its accounting group. + * +diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h +index ba72807c73d4..ddf7af466d35 100644 +--- a/kernel/sched/cpuacct.h ++++ b/kernel/sched/cpuacct.h +@@ -2,6 +2,7 @@ + + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); ++extern void cpuacct_cpu_stats_show(struct seq_file *sf); + + #else + +@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) + { + } + ++static inline void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++} ++ + #endif + +commit 8dde150866b8c433216105c50b7e889d5242d583 +Author: Tejun Heo +Date: Fri Aug 5 12:41:01 2016 -0400 + + cgroup: add documentation regarding CPU controller cgroup v2 support + + Signed-off-by: Tejun Heo + +diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt +new file mode 100644 +index 000000000000..1ed7032d4472 +--- /dev/null ++++ b/Documentation/cgroup-v2-cpu.txt +@@ -0,0 +1,368 @@ ++ ++ ++CPU Controller on Control Group v2 ++ ++August, 2016 Tejun Heo ++ ++ ++While most controllers have support for cgroup v2 now, the CPU ++controller support is not upstream yet due to objections from the ++scheduler maintainers on the basic designs of cgroup v2. This ++document explains the current situation as well as an interim ++solution, and details the disagreements and arguments. The latest ++version of this document can be found at the following URL. ++ ++ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu ++ ++This document was posted to the linux-kernel and cgroup mailing lists. ++Unfortunately, no consensus was reached as of Oct, 2016. The thread ++can be found at the following URL. ++ ++ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org ++ ++ ++CONTENTS ++ ++1. Current Situation and Interim Solution ++2. Disagreements and Arguments ++ 2-1. Contentious Restrictions ++ 2-1-1. Process Granularity ++ 2-1-2. No Internal Process Constraint ++ 2-2. Impact on CPU Controller ++ 2-2-1. Impact of Process Granularity ++ 2-2-2. Impact of No Internal Process Constraint ++ 2-3. Arguments for cgroup v2 ++3. Way Forward ++4. References ++ ++ ++1. Current Situation and Interim Solution ++ ++All objections from the scheduler maintainers apply to cgroup v2 core ++design, and there are no known objections to the specifics of the CPU ++controller cgroup v2 interface. The only blocked part is changes to ++expose the CPU controller interface on cgroup v2, which comprises the ++following two patches: ++ ++ [1] sched: Misc preps for cgroup unified hierarchy interface ++ [2] sched: Implement interface for cgroup unified hierarchy ++ ++The necessary changes are superficial and implement the interface ++files on cgroup v2. The combined diffstat is as follows. ++ ++ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-- ++ kernel/sched/cpuacct.c | 57 ++++++++++++------ ++ kernel/sched/cpuacct.h | 5 + ++ 3 files changed, 189 insertions(+), 22 deletions(-) ++ ++The patches are easy to apply and forward-port. The following git ++branch will always carry the two patches on top of the latest release ++of the upstream kernel. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu ++ ++There also are versioned branches going back to v4.4. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER ++ ++While it's difficult to tell whether the CPU controller support will ++be merged, there are crucial resource control features in cgroup v2 ++that are only possible due to the design choices that are being ++objected to, and every effort will be made to ease enabling the CPU ++controller cgroup v2 support out-of-tree for parties which choose to. ++ ++ ++2. Disagreements and Arguments ++ ++There have been several lengthy discussion threads [3][4] on LKML ++around the structural constraints of cgroup v2. The two that affect ++the CPU controller are process granularity and no internal process ++constraint. Both arise primarily from the need for common resource ++domain definition across different resources. ++ ++The common resource domain is a powerful concept in cgroup v2 that ++allows controllers to make basic assumptions about the structural ++organization of processes and controllers inside the cgroup hierarchy, ++and thus solve problems spanning multiple types of resources. The ++prime example for this is page cache writeback: dirty page cache is ++regulated through throttling buffered writers based on memory ++availability, and initiating batched write outs to the disk based on ++IO capacity. Tracking and controlling writeback inside a cgroup thus ++requires the direct cooperation of the memory and the IO controller. ++ ++This easily extends to other areas, such as CPU cycles consumed while ++performing memory reclaim or IO encryption. ++ ++ ++2-1. Contentious Restrictions ++ ++For controllers of different resources to work together, they must ++agree on a common organization. This uniform model across controllers ++imposes two contentious restrictions on the CPU controller: process ++granularity and the no-internal-process constraint. ++ ++ ++ 2-1-1. Process Granularity ++ ++ For memory, because an address space is shared between all threads ++ of a process, the terminal consumer is a process, not a thread. ++ Separating the threads of a single process into different memory ++ control domains doesn't make semantical sense. cgroup v2 ensures ++ that all controller can agree on the same organization by requiring ++ that threads of the same process belong to the same cgroup. ++ ++ There are other reasons to enforce process granularity. One ++ important one is isolating system-level management operations from ++ in-process application operations. The cgroup interface, being a ++ virtual filesystem, is very unfit for multiple independent ++ operations taking place at the same time as most operations have to ++ be multi-step and there is no way to synchronize multiple accessors. ++ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity" ++ ++ ++ 2-1-2. No Internal Process Constraint ++ ++ cgroup v2 does not allow processes to belong to any cgroup which has ++ child cgroups when resource controllers are enabled on it (the ++ notable exception being the root cgroup itself). This is because, ++ for some resources, a resource domain (cgroup) is not directly ++ comparable to the terminal consumer (process/task) of said resource, ++ and so putting the two into a sibling relationship isn't meaningful. ++ ++ - Differing Control Parameters and Capabilities ++ ++ A cgroup controller has different resource control parameters and ++ capabilities from a terminal consumer, be that a task or process. ++ There are a couple cases where a cgroup control knob can be mapped ++ to a per-task or per-process API but they are exceptions and the ++ mappings aren't obvious even in those cases. ++ ++ For example, task priorities (also known as nice values) set ++ through setpriority(2) are mapped to the CPU controller ++ "cpu.shares" values. However, how exactly the two ranges map and ++ even the fact that they map to each other at all are not obvious. ++ ++ The situation gets further muddled when considering other resource ++ types and control knobs. IO priorities set through ioprio_set(2) ++ cannot be mapped to IO controller weights and most cgroup resource ++ control knobs including the bandwidth control knobs of the CPU ++ controller don't have counterparts in the terminal consumers. ++ ++ - Anonymous Resource Consumption ++ ++ For CPU, every time slice consumed from inside a cgroup, which ++ comprises most but not all of consumed CPU time for the cgroup, ++ can be clearly attributed to a specific task or process. Because ++ these two types of entities are directly comparable as consumers ++ of CPU time, it's theoretically possible to mix tasks and cgroups ++ on the same tree levels and let them directly compete for the time ++ quota available to their common ancestor. ++ ++ However, the same can't be said for resource types like memory or ++ IO: the memory consumed by the page cache, for example, can be ++ tracked on a per-cgroup level, but due to mismatches in lifetimes ++ of involved objects (page cache can persist long after processes ++ are gone), shared usages and the implementation overhead of ++ tracking persistent state, it can no longer be attributed to ++ individual processes after instantiation. Consequently, any IO ++ incurred by page cache writeback can be attributed to a cgroup, ++ but not to the individual consumers inside the cgroup. ++ ++ For memory and IO, this makes a resource domain (cgroup) an object ++ of a fundamentally different type than a terminal consumer ++ (process). A process can't be a first class object in the resource ++ distribution graph as its total resource consumption can't be ++ described without the containing resource domain. ++ ++ Disallowing processes in internal cgroups avoids competition between ++ cgroups and processes which cannot be meaningfully defined for these ++ resources. All resource control takes place among cgroups and a ++ terminal consumer interacts with the containing cgroup the same way ++ it would with the system without cgroup. ++ ++ Root cgroup is exempt from this constraint, which is in line with ++ how root cgroup is handled in general - it's excluded from cgroup ++ resource accounting and control. ++ ++ ++Enforcing process granularity and no internal process constraint ++allows all controllers to be on the same footing in terms of resource ++distribution hierarchy. ++ ++ ++2-2. Impact on CPU Controller ++ ++As indicated earlier, the CPU controller's resource distribution graph ++is the simplest. Every schedulable resource consumption can be ++attributed to a specific task. In addition, for weight based control, ++the per-task priority set through setpriority(2) can be translated to ++and from a per-cgroup weight. As such, the CPU controller can treat a ++task and a cgroup symmetrically, allowing support for any tree layout ++of cgroups and tasks. Both process granularity and the no internal ++process constraint restrict how the CPU controller can be used. ++ ++ ++ 2-2-1. Impact of Process Granularity ++ ++ Process granularity prevents tasks belonging to the same process to ++ be assigned to different cgroups. It was pointed out [6] that this ++ excludes the valid use case of hierarchical CPU distribution within ++ processes. ++ ++ To address this issue, the rgroup (resource group) [7][8][9] ++ interface, an extension of the existing setpriority(2) API, was ++ proposed, which is in line with other programmable priority ++ mechanisms and eliminates the risk of in-application configuration ++ and system configuration stepping on each other's toes. ++ Unfortunately, the proposal quickly turned into discussions around ++ cgroup v2 design decisions [4] and no consensus could be reached. ++ ++ ++ 2-2-2. Impact of No Internal Process Constraint ++ ++ The no internal process constraint disallows tasks from competing ++ directly against cgroups. Here is an excerpt from Peter Zijlstra ++ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and ++ t4 are tasks: ++ ++ ++ R ++ / | \ ++ t1 t2 A ++ / \ ++ t3 t4 ++ ++ ++ Is fundamentally different from: ++ ++ ++ R ++ / \ ++ L A ++ / \ / \ ++ t1 t2 t3 t4 ++ ++ ++ Because if in the first hierarchy you add a task (t5) to R, all of ++ its A will run at 1/4th of total bandwidth where before it had ++ 1/3rd, whereas with the second example, if you add our t5 to L, A ++ doesn't get any less bandwidth. ++ ++ ++ It is true that the trees are semantically different from each other ++ and the symmetric handling of tasks and cgroups is aesthetically ++ pleasing. However, it isn't clear what the practical usefulness of ++ a layout with direct competition between tasks and cgroups would be, ++ considering that number and behavior of tasks are controlled by each ++ application, and cgroups primarily deal with system level resource ++ distribution; changes in the number of active threads would directly ++ impact resource distribution. Real world use cases of such layouts ++ could not be established during the discussions. ++ ++ ++2-3. Arguments for cgroup v2 ++ ++There are strong demands for comprehensive hierarchical resource ++control across all major resources, and establishing a common resource ++hierarchy is an essential step. As with most engineering decisions, ++common resource hierarchy definition comes with its trade-offs. With ++cgroup v2, the trade-offs are in the form of structural constraints ++which, among others, restrict the CPU controller's space of possible ++configurations. ++ ++However, even with the restrictions, cgroup v2, in combination with ++rgroup, covers most of identified real world use cases while enabling ++new important use cases of resource control across multiple resource ++types that were fundamentally broken previously. ++ ++Furthermore, for resource control, treating resource domains as ++objects of a different type from terminal consumers has important ++advantages - it can account for resource consumptions which are not ++tied to any specific terminal consumer, be that a task or process, and ++allows decoupling resource distribution controls from in-application ++APIs. Even the CPU controller may benefit from it as the kernel can ++consume significant amount of CPU cycles in interrupt context or tasks ++shared across multiple resource domains (e.g. softirq). ++ ++Finally, it's important to note that enabling cgroup v2 support for ++the CPU controller doesn't block use cases which require the features ++which are not available on cgroup v2. Unlikely, but should anybody ++actually rely on the CPU controller's symmetric handling of tasks and ++cgroups, backward compatibility is and will be maintained by being ++able to disconnect the controller from the cgroup v2 hierarchy and use ++it standalone. This also holds for cpuset which is often used in ++highly customized configurations which might be a poor fit for common ++resource domains. ++ ++The required changes are minimal, the benefits for the target use ++cases are critical and obvious, and use cases which have to use v1 can ++continue to do so. ++ ++ ++3. Way Forward ++ ++cgroup v2 primarily aims to solve the problem of comprehensive ++hierarchical resource control across all major computing resources, ++which is one of the core problems of modern server infrastructure ++engineering. The trade-offs that cgroup v2 took are results of ++pursuing that goal and gaining a better understanding of the nature of ++resource control in the process. ++ ++I believe that real world usages will prove cgroup v2's model right, ++considering the crucial pieces of comprehensive resource control that ++cannot be implemented without common resource domains. This is not to ++say that cgroup v2 is fixed in stone and can't be updated; if there is ++an approach which better serves both comprehensive resource control ++and the CPU controller's flexibility, we will surely move towards ++that. It goes without saying that discussions around such approach ++should consider practical aspects of resource control as a whole ++rather than absolutely focusing on a particular controller. ++ ++Until such consensus can be reached, the CPU controller cgroup v2 ++support will be maintained out of the mainline kernel in an easily ++accessible form. If there is anything cgroup developers can do to ++ease the pain, please feel free to contact us on the cgroup mailing ++list at cgroups@vger.kernel.org. ++ ++ ++4. References ++ ++[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org ++ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface ++ Tejun Heo ++ ++[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org ++ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org ++ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Peter Zijlstra ++ ++[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt ++ Control Group v2 ++ Tejun Heo ++ ++[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com ++ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Paul Turner ++ ++[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org ++ [RFD] cgroup: thread granularity support for cpu controller ++ Tejun Heo ++ ++[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org ++ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Tejun Heo ++ ++[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org ++ Example program for PRIO_RGRP ++ Tejun Heo ++ ++[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource ++ Peter Zijlstra diff --git a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.6.patch b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.6.patch deleted file mode 100644 index f64908317cac..000000000000 --- a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.6.patch +++ /dev/null @@ -1,407 +0,0 @@ -commit 6426c5b02d4aab620219b08a5d97ad8851b56b0d -Author: Tejun Heo -Date: Fri Mar 11 07:31:23 2016 -0500 - - sched: Misc preps for cgroup unified hierarchy interface - - Make the following changes in preparation for the cpu controller - interface implementation for the unified hierarchy. This patch - doesn't cause any functional differences. - - * s/cpu_stats_show()/cpu_cfs_stats_show()/ - - * s/cpu_files/cpu_legacy_files/ - - * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While - at it, remove pointless cpuacct_stat_desc[] array. - - Signed-off-by: Tejun Heo - Cc: Ingo Molnar - Cc: Peter Zijlstra - Cc: Li Zefan - Cc: Johannes Weiner - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index d1f7149..0d34f35 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -8371,7 +8371,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) - return ret; - } - --static int cpu_stats_show(struct seq_file *sf, void *v) -+static int cpu_cfs_stats_show(struct seq_file *sf, void *v) - { - struct task_group *tg = css_tg(seq_css(sf)); - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -@@ -8411,7 +8411,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, - } - #endif /* CONFIG_RT_GROUP_SCHED */ - --static struct cftype cpu_files[] = { -+static struct cftype cpu_legacy_files[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -8432,7 +8432,7 @@ static struct cftype cpu_files[] = { - }, - { - .name = "stat", -- .seq_show = cpu_stats_show, -+ .seq_show = cpu_cfs_stats_show, - }, - #endif - #ifdef CONFIG_RT_GROUP_SCHED -@@ -8457,7 +8457,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .fork = cpu_cgroup_fork, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, - .early_init = true, - }; - -diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c -index 4a81120..b99030a 100644 ---- a/kernel/sched/cpuacct.c -+++ b/kernel/sched/cpuacct.c -@@ -180,36 +180,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) - return 0; - } - --static const char * const cpuacct_stat_desc[] = { -- [CPUACCT_STAT_USER] = "user", -- [CPUACCT_STAT_SYSTEM] = "system", --}; -- --static int cpuacct_stats_show(struct seq_file *sf, void *v) -+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp) - { -- struct cpuacct *ca = css_ca(seq_css(sf)); - int cpu; -- s64 val = 0; - -+ *userp = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -- val += kcpustat->cpustat[CPUTIME_USER]; -- val += kcpustat->cpustat[CPUTIME_NICE]; -+ *userp += kcpustat->cpustat[CPUTIME_USER]; -+ *userp += kcpustat->cpustat[CPUTIME_NICE]; - } -- val = cputime64_to_clock_t(val); -- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); - -- val = 0; -+ *sysp = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -- val += kcpustat->cpustat[CPUTIME_SYSTEM]; -- val += kcpustat->cpustat[CPUTIME_IRQ]; -- val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; -+ *sysp += kcpustat->cpustat[CPUTIME_SYSTEM]; -+ *sysp += kcpustat->cpustat[CPUTIME_IRQ]; -+ *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } -+} - -- val = cputime64_to_clock_t(val); -- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); -+static int cpuacct_stats_show(struct seq_file *sf, void *v) -+{ -+ cputime64_t user, sys; - -+ cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys); -+ seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user)); -+ seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys)); - return 0; - } - - -commit d2a799f795a5d5a69c9dc365c34f926e0649f840 -Author: Tejun Heo -Date: Fri Mar 11 07:31:23 2016 -0500 - - sched: Implement interface for cgroup unified hierarchy - - While the cpu controller doesn't have any functional problems, there - are a couple interface issues which can be addressed in the v2 - interface. - - * cpuacct being a separate controller. This separation is artificial - and rather pointless as demonstrated by most use cases co-mounting - the two controllers. It also forces certain information to be - accounted twice. - - * Use of different time units. Writable control knobs use - microseconds, some stat fields use nanoseconds while other cpuacct - stat fields use centiseconds. - - * Control knobs which can't be used in the root cgroup still show up - in the root. - - * Control knob names and semantics aren't consistent with other - controllers. - - This patchset implements cpu controller's interface on the unified - hierarchy which adheres to the controller file conventions described - in Documentation/cgroups/unified-hierarchy.txt. Overall, the - following changes are made. - - * cpuacct is implictly enabled and disabled by cpu and its information - is reported through "cpu.stat" which now uses microseconds for all - time durations. All time duration fields now have "_usec" appended - to them for clarity. While this doesn't solve the double accounting - immediately, once majority of users switch to v2, cpu can directly - account and report the relevant stats and cpuacct can be disabled on - the unified hierarchy. - - Note that cpuacct.usage_percpu is currently not included in - "cpu.stat". If this information is actually called for, it can be - added later. - - * "cpu.shares" is replaced with "cpu.weight" and operates on the - standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). - The weight is scaled to scheduler weight so that 100 maps to 1024 - and the ratio relationship is preserved - if weight is W and its - scaled value is S, W / 100 == S / 1024. While the mapped range is a - bit smaller than the orignal scheduler weight range, the dead zones - on both sides are relatively small and covers wider range than the - nice value mappings. This file doesn't make sense in the root - cgroup and isn't create on root. - - * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" - which contains both quota and period. - - * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by - "cpu.rt.max" which contains both runtime and period. - - v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for - CFS bandwidth stats and also using raw division for u64. Use - CONFIG_CFS_BANDWITH and do_div() instead. - - The semantics of "cpu.rt.max" is not fully decided yet. Dropped - for now. - - Signed-off-by: Tejun Heo - Cc: Ingo Molnar - Cc: Peter Zijlstra - Cc: Li Zefan - Cc: Johannes Weiner - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 0d34f35..5990efc 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -8450,6 +8450,139 @@ static struct cftype cpu_legacy_files[] = { - { } /* terminate */ - }; - -+static int cpu_stats_show(struct seq_file *sf, void *v) -+{ -+ cpuacct_cpu_stats_show(sf); -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+ { -+ struct task_group *tg = css_tg(seq_css(sf)); -+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -+ u64 throttled_usec; -+ -+ throttled_usec = cfs_b->throttled_time; -+ do_div(throttled_usec, NSEC_PER_USEC); -+ -+ seq_printf(sf, "nr_periods %d\n" -+ "nr_throttled %d\n" -+ "throttled_usec %llu\n", -+ cfs_b->nr_periods, cfs_b->nr_throttled, -+ throttled_usec); -+ } -+#endif -+ return 0; -+} -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ struct task_group *tg = css_tg(css); -+ u64 weight = scale_load_down(tg->shares); -+ -+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+} -+ -+static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -+ struct cftype *cftype, u64 weight) -+{ -+ /* -+ * cgroup weight knobs should use the common MIN, DFL and MAX -+ * values which are 1, 100 and 10000 respectively. While it loses -+ * a bit of range on both ends, it maps pretty well onto the shares -+ * value used by scheduler and the round-trip conversions preserve -+ * the original value over the entire range. -+ */ -+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ return -ERANGE; -+ -+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ -+ return sched_group_set_shares(css_tg(css), scale_load(weight)); -+} -+#endif -+ -+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, -+ long period, long quota) -+{ -+ if (quota < 0) -+ seq_puts(sf, "max"); -+ else -+ seq_printf(sf, "%ld", quota); -+ -+ seq_printf(sf, " %ld\n", period); -+} -+ -+/* caller should put the current value in *@periodp before calling */ -+static int __maybe_unused cpu_period_quota_parse(char *buf, -+ u64 *periodp, u64 *quotap) -+{ -+ char tok[21]; /* U64_MAX */ -+ -+ if (!sscanf(buf, "%s %llu", tok, periodp)) -+ return -EINVAL; -+ -+ *periodp *= NSEC_PER_USEC; -+ -+ if (sscanf(tok, "%llu", quotap)) -+ *quotap *= NSEC_PER_USEC; -+ else if (!strcmp(tok, "max")) -+ *quotap = RUNTIME_INF; -+ else -+ return -EINVAL; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+static int cpu_max_show(struct seq_file *sf, void *v) -+{ -+ struct task_group *tg = css_tg(seq_css(sf)); -+ -+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); -+ return 0; -+} -+ -+static ssize_t cpu_max_write(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, loff_t off) -+{ -+ struct task_group *tg = css_tg(of_css(of)); -+ u64 period = tg_get_cfs_period(tg); -+ u64 quota; -+ int ret; -+ -+ ret = cpu_period_quota_parse(buf, &period, "a); -+ if (!ret) -+ ret = tg_set_cfs_bandwidth(tg, period, quota); -+ return ret ?: nbytes; -+} -+#endif -+ -+static struct cftype cpu_files[] = { -+ { -+ .name = "stat", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = cpu_stats_show, -+ }, -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ { -+ .name = "weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .read_u64 = cpu_weight_read_u64, -+ .write_u64 = cpu_weight_write_u64, -+ }, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ { -+ .name = "max", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = cpu_max_show, -+ .write = cpu_max_write, -+ }, -+#endif -+ { } /* terminate */ -+}; -+ - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = cpu_cgroup_css_alloc, - .css_released = cpu_cgroup_css_released, -@@ -8458,7 +8591,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, - .early_init = true, -+#ifdef CONFIG_CGROUP_CPUACCT -+ /* -+ * cpuacct is enabled together with cpu on the unified hierarchy -+ * and its stats are reported through "cpu.stat". -+ */ -+ .depends_on = 1 << cpuacct_cgrp_id, -+#endif - }; - - #endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c -index b99030a..a1a5a4b 100644 ---- a/kernel/sched/cpuacct.c -+++ b/kernel/sched/cpuacct.c -@@ -227,6 +227,30 @@ static struct cftype files[] = { - { } /* terminate */ - }; - -+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ -+void cpuacct_cpu_stats_show(struct seq_file *sf) -+{ -+ struct cgroup_subsys_state *css; -+ u64 usage, user, sys; -+ -+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); -+ -+ usage = cpuusage_read(css, seq_cft(sf)); -+ cpuacct_stats_read(css_ca(css), &user, &sys); -+ -+ user *= TICK_NSEC; -+ sys *= TICK_NSEC; -+ do_div(usage, NSEC_PER_USEC); -+ do_div(user, NSEC_PER_USEC); -+ do_div(sys, NSEC_PER_USEC); -+ -+ seq_printf(sf, "usage_usec %llu\n" -+ "user_usec %llu\n" -+ "system_usec %llu\n", usage, user, sys); -+ -+ css_put(css); -+} -+ - /* - * charge this task's execution time to its accounting group. - * -diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h -index ba72807..ddf7af4 100644 ---- a/kernel/sched/cpuacct.h -+++ b/kernel/sched/cpuacct.h -@@ -2,6 +2,7 @@ - - extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); - extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); -+extern void cpuacct_cpu_stats_show(struct seq_file *sf); - - #else - -@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) - { - } - -+static inline void cpuacct_cpu_stats_show(struct seq_file *sf) -+{ -+} -+ - #endif diff --git a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.7.patch b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.7.patch deleted file mode 100644 index 74dae740e37c..000000000000 --- a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.7.patch +++ /dev/null @@ -1,407 +0,0 @@ -commit 0d966df508ef4d6c0b1baae9e369f4fb0d3e10af -Author: Tejun Heo -Date: Fri Mar 11 07:31:23 2016 -0500 - - sched: Misc preps for cgroup unified hierarchy interface - - Make the following changes in preparation for the cpu controller - interface implementation for the unified hierarchy. This patch - doesn't cause any functional differences. - - * s/cpu_stats_show()/cpu_cfs_stats_show()/ - - * s/cpu_files/cpu_legacy_files/ - - * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While - at it, remove pointless cpuacct_stat_desc[] array. - - Signed-off-by: Tejun Heo - Cc: Ingo Molnar - Cc: Peter Zijlstra - Cc: Li Zefan - Cc: Johannes Weiner - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 97ee9ac..c148dfe 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -8482,7 +8482,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) - return ret; - } - --static int cpu_stats_show(struct seq_file *sf, void *v) -+static int cpu_cfs_stats_show(struct seq_file *sf, void *v) - { - struct task_group *tg = css_tg(seq_css(sf)); - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -@@ -8522,7 +8522,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, - } - #endif /* CONFIG_RT_GROUP_SCHED */ - --static struct cftype cpu_files[] = { -+static struct cftype cpu_legacy_files[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -8543,7 +8543,7 @@ static struct cftype cpu_files[] = { - }, - { - .name = "stat", -- .seq_show = cpu_stats_show, -+ .seq_show = cpu_cfs_stats_show, - }, - #endif - #ifdef CONFIG_RT_GROUP_SCHED -@@ -8568,7 +8568,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .fork = cpu_cgroup_fork, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, - .early_init = true, - }; - -diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c -index 41f85c4..3eb9eda 100644 ---- a/kernel/sched/cpuacct.c -+++ b/kernel/sched/cpuacct.c -@@ -242,36 +242,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); - } - --static const char * const cpuacct_stat_desc[] = { -- [CPUACCT_STAT_USER] = "user", -- [CPUACCT_STAT_SYSTEM] = "system", --}; -- --static int cpuacct_stats_show(struct seq_file *sf, void *v) -+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp) - { -- struct cpuacct *ca = css_ca(seq_css(sf)); - int cpu; -- s64 val = 0; - -+ *userp = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -- val += kcpustat->cpustat[CPUTIME_USER]; -- val += kcpustat->cpustat[CPUTIME_NICE]; -+ *userp += kcpustat->cpustat[CPUTIME_USER]; -+ *userp += kcpustat->cpustat[CPUTIME_NICE]; - } -- val = cputime64_to_clock_t(val); -- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); - -- val = 0; -+ *sysp = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -- val += kcpustat->cpustat[CPUTIME_SYSTEM]; -- val += kcpustat->cpustat[CPUTIME_IRQ]; -- val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; -+ *sysp += kcpustat->cpustat[CPUTIME_SYSTEM]; -+ *sysp += kcpustat->cpustat[CPUTIME_IRQ]; -+ *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } -+} - -- val = cputime64_to_clock_t(val); -- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); -+static int cpuacct_stats_show(struct seq_file *sf, void *v) -+{ -+ cputime64_t user, sys; - -+ cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys); -+ seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user)); -+ seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys)); - return 0; - } - - -commit ed6d93036ec930cb774da10b7c87f67905ce71f1 -Author: Tejun Heo -Date: Fri Mar 11 07:31:23 2016 -0500 - - sched: Implement interface for cgroup unified hierarchy - - While the cpu controller doesn't have any functional problems, there - are a couple interface issues which can be addressed in the v2 - interface. - - * cpuacct being a separate controller. This separation is artificial - and rather pointless as demonstrated by most use cases co-mounting - the two controllers. It also forces certain information to be - accounted twice. - - * Use of different time units. Writable control knobs use - microseconds, some stat fields use nanoseconds while other cpuacct - stat fields use centiseconds. - - * Control knobs which can't be used in the root cgroup still show up - in the root. - - * Control knob names and semantics aren't consistent with other - controllers. - - This patchset implements cpu controller's interface on the unified - hierarchy which adheres to the controller file conventions described - in Documentation/cgroups/unified-hierarchy.txt. Overall, the - following changes are made. - - * cpuacct is implictly enabled and disabled by cpu and its information - is reported through "cpu.stat" which now uses microseconds for all - time durations. All time duration fields now have "_usec" appended - to them for clarity. While this doesn't solve the double accounting - immediately, once majority of users switch to v2, cpu can directly - account and report the relevant stats and cpuacct can be disabled on - the unified hierarchy. - - Note that cpuacct.usage_percpu is currently not included in - "cpu.stat". If this information is actually called for, it can be - added later. - - * "cpu.shares" is replaced with "cpu.weight" and operates on the - standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). - The weight is scaled to scheduler weight so that 100 maps to 1024 - and the ratio relationship is preserved - if weight is W and its - scaled value is S, W / 100 == S / 1024. While the mapped range is a - bit smaller than the orignal scheduler weight range, the dead zones - on both sides are relatively small and covers wider range than the - nice value mappings. This file doesn't make sense in the root - cgroup and isn't create on root. - - * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" - which contains both quota and period. - - * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by - "cpu.rt.max" which contains both runtime and period. - - v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for - CFS bandwidth stats and also using raw division for u64. Use - CONFIG_CFS_BANDWITH and do_div() instead. - - The semantics of "cpu.rt.max" is not fully decided yet. Dropped - for now. - - Signed-off-by: Tejun Heo - Cc: Ingo Molnar - Cc: Peter Zijlstra - Cc: Li Zefan - Cc: Johannes Weiner - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index c148dfe..7bba2c5 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -8561,6 +8561,139 @@ static struct cftype cpu_legacy_files[] = { - { } /* terminate */ - }; - -+static int cpu_stats_show(struct seq_file *sf, void *v) -+{ -+ cpuacct_cpu_stats_show(sf); -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+ { -+ struct task_group *tg = css_tg(seq_css(sf)); -+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -+ u64 throttled_usec; -+ -+ throttled_usec = cfs_b->throttled_time; -+ do_div(throttled_usec, NSEC_PER_USEC); -+ -+ seq_printf(sf, "nr_periods %d\n" -+ "nr_throttled %d\n" -+ "throttled_usec %llu\n", -+ cfs_b->nr_periods, cfs_b->nr_throttled, -+ throttled_usec); -+ } -+#endif -+ return 0; -+} -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ struct task_group *tg = css_tg(css); -+ u64 weight = scale_load_down(tg->shares); -+ -+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+} -+ -+static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -+ struct cftype *cftype, u64 weight) -+{ -+ /* -+ * cgroup weight knobs should use the common MIN, DFL and MAX -+ * values which are 1, 100 and 10000 respectively. While it loses -+ * a bit of range on both ends, it maps pretty well onto the shares -+ * value used by scheduler and the round-trip conversions preserve -+ * the original value over the entire range. -+ */ -+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ return -ERANGE; -+ -+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ -+ return sched_group_set_shares(css_tg(css), scale_load(weight)); -+} -+#endif -+ -+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, -+ long period, long quota) -+{ -+ if (quota < 0) -+ seq_puts(sf, "max"); -+ else -+ seq_printf(sf, "%ld", quota); -+ -+ seq_printf(sf, " %ld\n", period); -+} -+ -+/* caller should put the current value in *@periodp before calling */ -+static int __maybe_unused cpu_period_quota_parse(char *buf, -+ u64 *periodp, u64 *quotap) -+{ -+ char tok[21]; /* U64_MAX */ -+ -+ if (!sscanf(buf, "%s %llu", tok, periodp)) -+ return -EINVAL; -+ -+ *periodp *= NSEC_PER_USEC; -+ -+ if (sscanf(tok, "%llu", quotap)) -+ *quotap *= NSEC_PER_USEC; -+ else if (!strcmp(tok, "max")) -+ *quotap = RUNTIME_INF; -+ else -+ return -EINVAL; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+static int cpu_max_show(struct seq_file *sf, void *v) -+{ -+ struct task_group *tg = css_tg(seq_css(sf)); -+ -+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); -+ return 0; -+} -+ -+static ssize_t cpu_max_write(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, loff_t off) -+{ -+ struct task_group *tg = css_tg(of_css(of)); -+ u64 period = tg_get_cfs_period(tg); -+ u64 quota; -+ int ret; -+ -+ ret = cpu_period_quota_parse(buf, &period, "a); -+ if (!ret) -+ ret = tg_set_cfs_bandwidth(tg, period, quota); -+ return ret ?: nbytes; -+} -+#endif -+ -+static struct cftype cpu_files[] = { -+ { -+ .name = "stat", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = cpu_stats_show, -+ }, -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ { -+ .name = "weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .read_u64 = cpu_weight_read_u64, -+ .write_u64 = cpu_weight_write_u64, -+ }, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ { -+ .name = "max", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = cpu_max_show, -+ .write = cpu_max_write, -+ }, -+#endif -+ { } /* terminate */ -+}; -+ - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = cpu_cgroup_css_alloc, - .css_released = cpu_cgroup_css_released, -@@ -8569,7 +8702,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, - .early_init = true, -+#ifdef CONFIG_CGROUP_CPUACCT -+ /* -+ * cpuacct is enabled together with cpu on the unified hierarchy -+ * and its stats are reported through "cpu.stat". -+ */ -+ .depends_on = 1 << cpuacct_cgrp_id, -+#endif - }; - - #endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c -index 3eb9eda..7a02d26 100644 ---- a/kernel/sched/cpuacct.c -+++ b/kernel/sched/cpuacct.c -@@ -305,6 +305,30 @@ static struct cftype files[] = { - { } /* terminate */ - }; - -+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ -+void cpuacct_cpu_stats_show(struct seq_file *sf) -+{ -+ struct cgroup_subsys_state *css; -+ u64 usage, user, sys; -+ -+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); -+ -+ usage = cpuusage_read(css, seq_cft(sf)); -+ cpuacct_stats_read(css_ca(css), &user, &sys); -+ -+ user *= TICK_NSEC; -+ sys *= TICK_NSEC; -+ do_div(usage, NSEC_PER_USEC); -+ do_div(user, NSEC_PER_USEC); -+ do_div(sys, NSEC_PER_USEC); -+ -+ seq_printf(sf, "usage_usec %llu\n" -+ "user_usec %llu\n" -+ "system_usec %llu\n", usage, user, sys); -+ -+ css_put(css); -+} -+ - /* - * charge this task's execution time to its accounting group. - * -diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h -index ba72807..ddf7af4 100644 ---- a/kernel/sched/cpuacct.h -+++ b/kernel/sched/cpuacct.h -@@ -2,6 +2,7 @@ - - extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); - extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); -+extern void cpuacct_cpu_stats_show(struct seq_file *sf); - - #else - -@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) - { - } - -+static inline void cpuacct_cpu_stats_show(struct seq_file *sf) -+{ -+} -+ - #endif diff --git a/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch new file mode 100644 index 000000000000..6f0904cbce99 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch @@ -0,0 +1,784 @@ +commit 280858b0bb3384b9ec06b455e196b453888bd6b8 +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Misc preps for cgroup unified hierarchy interface + + Make the following changes in preparation for the cpu controller + interface implementation for the unified hierarchy. This patch + doesn't cause any functional differences. + + * s/cpu_stats_show()/cpu_cfs_stats_show()/ + + * s/cpu_files/cpu_legacy_files/ + + * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While + at it, make the @val array u64 for consistency. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 154fd689fe02..57472485b79c 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8705,7 +8705,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) + return ret; + } + +-static int cpu_stats_show(struct seq_file *sf, void *v) ++static int cpu_cfs_stats_show(struct seq_file *sf, void *v) + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +@@ -8745,7 +8745,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-static struct cftype cpu_files[] = { ++static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", +@@ -8766,7 +8766,7 @@ static struct cftype cpu_files[] = { + }, + { + .name = "stat", +- .seq_show = cpu_stats_show, ++ .seq_show = cpu_cfs_stats_show, + }, + #endif + #ifdef CONFIG_RT_GROUP_SCHED +@@ -8791,7 +8791,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, +- .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, + .early_init = true, + }; + +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index bc0b309c3f19..d1e5dd0b3a64 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) + return 0; + } + +-static int cpuacct_stats_show(struct seq_file *sf, void *v) ++static void cpuacct_stats_read(struct cpuacct *ca, ++ u64 (*val)[CPUACCT_STAT_NSTATS]) + { +- struct cpuacct *ca = css_ca(seq_css(sf)); +- s64 val[CPUACCT_STAT_NSTATS]; + int cpu; +- int stat; + +- memset(val, 0, sizeof(val)); ++ memset(val, 0, sizeof(*val)); ++ + for_each_possible_cpu(cpu) { + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + } ++} ++ ++static int cpuacct_stats_show(struct seq_file *sf, void *v) ++{ ++ u64 val[CPUACCT_STAT_NSTATS]; ++ int stat; ++ ++ cpuacct_stats_read(css_ca(seq_css(sf)), &val); + + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { +- seq_printf(sf, "%s %lld\n", ++ seq_printf(sf, "%s %llu\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } + +commit 015cbdcb90034fd566d00de9d3d405613da3cd26 +Author: Tejun Heo +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Implement interface for cgroup unified hierarchy + + While the cpu controller doesn't have any functional problems, there + are a couple interface issues which can be addressed in the v2 + interface. + + * cpuacct being a separate controller. This separation is artificial + and rather pointless as demonstrated by most use cases co-mounting + the two controllers. It also forces certain information to be + accounted twice. + + * Use of different time units. Writable control knobs use + microseconds, some stat fields use nanoseconds while other cpuacct + stat fields use centiseconds. + + * Control knobs which can't be used in the root cgroup still show up + in the root. + + * Control knob names and semantics aren't consistent with other + controllers. + + This patchset implements cpu controller's interface on the unified + hierarchy which adheres to the controller file conventions described + in Documentation/cgroups/unified-hierarchy.txt. Overall, the + following changes are made. + + * cpuacct is implictly enabled and disabled by cpu and its information + is reported through "cpu.stat" which now uses microseconds for all + time durations. All time duration fields now have "_usec" appended + to them for clarity. While this doesn't solve the double accounting + immediately, once majority of users switch to v2, cpu can directly + account and report the relevant stats and cpuacct can be disabled on + the unified hierarchy. + + Note that cpuacct.usage_percpu is currently not included in + "cpu.stat". If this information is actually called for, it can be + added later. + + * "cpu.shares" is replaced with "cpu.weight" and operates on the + standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). + The weight is scaled to scheduler weight so that 100 maps to 1024 + and the ratio relationship is preserved - if weight is W and its + scaled value is S, W / 100 == S / 1024. While the mapped range is a + bit smaller than the orignal scheduler weight range, the dead zones + on both sides are relatively small and covers wider range than the + nice value mappings. This file doesn't make sense in the root + cgroup and isn't create on root. + + * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" + which contains both quota and period. + + * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by + "cpu.rt.max" which contains both runtime and period. + + v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for + CFS bandwidth stats and also using raw division for u64. Use + CONFIG_CFS_BANDWITH and do_div() instead. + + The semantics of "cpu.rt.max" is not fully decided yet. Dropped + for now. + + Signed-off-by: Tejun Heo + Cc: Ingo Molnar + Cc: Peter Zijlstra + Cc: Li Zefan + Cc: Johannes Weiner + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 57472485b79c..c0ae869f51c4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8784,6 +8784,139 @@ static struct cftype cpu_legacy_files[] = { + { } /* terminate */ + }; + ++static int cpu_stats_show(struct seq_file *sf, void *v) ++{ ++ cpuacct_cpu_stats_show(sf); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ struct task_group *tg = css_tg(seq_css(sf)); ++ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; ++ u64 throttled_usec; ++ ++ throttled_usec = cfs_b->throttled_time; ++ do_div(throttled_usec, NSEC_PER_USEC); ++ ++ seq_printf(sf, "nr_periods %d\n" ++ "nr_throttled %d\n" ++ "throttled_usec %llu\n", ++ cfs_b->nr_periods, cfs_b->nr_throttled, ++ throttled_usec); ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ struct task_group *tg = css_tg(css); ++ u64 weight = scale_load_down(tg->shares); ++ ++ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); ++} ++ ++static int cpu_weight_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 weight) ++{ ++ /* ++ * cgroup weight knobs should use the common MIN, DFL and MAX ++ * values which are 1, 100 and 10000 respectively. While it loses ++ * a bit of range on both ends, it maps pretty well onto the shares ++ * value used by scheduler and the round-trip conversions preserve ++ * the original value over the entire range. ++ */ ++ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) ++ return -ERANGE; ++ ++ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); ++ ++ return sched_group_set_shares(css_tg(css), scale_load(weight)); ++} ++#endif ++ ++static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, ++ long period, long quota) ++{ ++ if (quota < 0) ++ seq_puts(sf, "max"); ++ else ++ seq_printf(sf, "%ld", quota); ++ ++ seq_printf(sf, " %ld\n", period); ++} ++ ++/* caller should put the current value in *@periodp before calling */ ++static int __maybe_unused cpu_period_quota_parse(char *buf, ++ u64 *periodp, u64 *quotap) ++{ ++ char tok[21]; /* U64_MAX */ ++ ++ if (!sscanf(buf, "%s %llu", tok, periodp)) ++ return -EINVAL; ++ ++ *periodp *= NSEC_PER_USEC; ++ ++ if (sscanf(tok, "%llu", quotap)) ++ *quotap *= NSEC_PER_USEC; ++ else if (!strcmp(tok, "max")) ++ *quotap = RUNTIME_INF; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++static int cpu_max_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); ++ return 0; ++} ++ ++static ssize_t cpu_max_write(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, loff_t off) ++{ ++ struct task_group *tg = css_tg(of_css(of)); ++ u64 period = tg_get_cfs_period(tg); ++ u64 quota; ++ int ret; ++ ++ ret = cpu_period_quota_parse(buf, &period, "a); ++ if (!ret) ++ ret = tg_set_cfs_bandwidth(tg, period, quota); ++ return ret ?: nbytes; ++} ++#endif ++ ++static struct cftype cpu_files[] = { ++ { ++ .name = "stat", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_stats_show, ++ }, ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ { ++ .name = "weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_weight_read_u64, ++ .write_u64 = cpu_weight_write_u64, ++ }, ++#endif ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ .name = "max", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_max_show, ++ .write = cpu_max_write, ++ }, ++#endif ++ { } /* terminate */ ++}; ++ + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, +@@ -8792,7 +8925,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, + .early_init = true, ++#ifdef CONFIG_CGROUP_CPUACCT ++ /* ++ * cpuacct is enabled together with cpu on the unified hierarchy ++ * and its stats are reported through "cpu.stat". ++ */ ++ .depends_on = 1 << cpuacct_cgrp_id, ++#endif + }; + + #endif /* CONFIG_CGROUP_SCHED */ +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index d1e5dd0b3a64..57f390514c39 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -347,6 +347,31 @@ static struct cftype files[] = { + { } /* terminate */ + }; + ++/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ ++void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++ struct cgroup_subsys_state *css; ++ u64 usage, val[CPUACCT_STAT_NSTATS]; ++ ++ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); ++ ++ usage = cpuusage_read(css, seq_cft(sf)); ++ cpuacct_stats_read(css_ca(css), &val); ++ ++ val[CPUACCT_STAT_USER] *= TICK_NSEC; ++ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC; ++ do_div(usage, NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC); ++ ++ seq_printf(sf, "usage_usec %llu\n" ++ "user_usec %llu\n" ++ "system_usec %llu\n", ++ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]); ++ ++ css_put(css); ++} ++ + /* + * charge this task's execution time to its accounting group. + * +diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h +index ba72807c73d4..ddf7af466d35 100644 +--- a/kernel/sched/cpuacct.h ++++ b/kernel/sched/cpuacct.h +@@ -2,6 +2,7 @@ + + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); ++extern void cpuacct_cpu_stats_show(struct seq_file *sf); + + #else + +@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) + { + } + ++static inline void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++} ++ + #endif + +commit 5019fe3d7ec456b58d451ef06fe1f81d7d9f28a9 +Author: Tejun Heo +Date: Fri Aug 5 12:41:01 2016 -0400 + + cgroup: add documentation regarding CPU controller cgroup v2 support + + Signed-off-by: Tejun Heo + +diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt +new file mode 100644 +index 000000000000..1ed7032d4472 +--- /dev/null ++++ b/Documentation/cgroup-v2-cpu.txt +@@ -0,0 +1,368 @@ ++ ++ ++CPU Controller on Control Group v2 ++ ++August, 2016 Tejun Heo ++ ++ ++While most controllers have support for cgroup v2 now, the CPU ++controller support is not upstream yet due to objections from the ++scheduler maintainers on the basic designs of cgroup v2. This ++document explains the current situation as well as an interim ++solution, and details the disagreements and arguments. The latest ++version of this document can be found at the following URL. ++ ++ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu ++ ++This document was posted to the linux-kernel and cgroup mailing lists. ++Unfortunately, no consensus was reached as of Oct, 2016. The thread ++can be found at the following URL. ++ ++ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org ++ ++ ++CONTENTS ++ ++1. Current Situation and Interim Solution ++2. Disagreements and Arguments ++ 2-1. Contentious Restrictions ++ 2-1-1. Process Granularity ++ 2-1-2. No Internal Process Constraint ++ 2-2. Impact on CPU Controller ++ 2-2-1. Impact of Process Granularity ++ 2-2-2. Impact of No Internal Process Constraint ++ 2-3. Arguments for cgroup v2 ++3. Way Forward ++4. References ++ ++ ++1. Current Situation and Interim Solution ++ ++All objections from the scheduler maintainers apply to cgroup v2 core ++design, and there are no known objections to the specifics of the CPU ++controller cgroup v2 interface. The only blocked part is changes to ++expose the CPU controller interface on cgroup v2, which comprises the ++following two patches: ++ ++ [1] sched: Misc preps for cgroup unified hierarchy interface ++ [2] sched: Implement interface for cgroup unified hierarchy ++ ++The necessary changes are superficial and implement the interface ++files on cgroup v2. The combined diffstat is as follows. ++ ++ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-- ++ kernel/sched/cpuacct.c | 57 ++++++++++++------ ++ kernel/sched/cpuacct.h | 5 + ++ 3 files changed, 189 insertions(+), 22 deletions(-) ++ ++The patches are easy to apply and forward-port. The following git ++branch will always carry the two patches on top of the latest release ++of the upstream kernel. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu ++ ++There also are versioned branches going back to v4.4. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER ++ ++While it's difficult to tell whether the CPU controller support will ++be merged, there are crucial resource control features in cgroup v2 ++that are only possible due to the design choices that are being ++objected to, and every effort will be made to ease enabling the CPU ++controller cgroup v2 support out-of-tree for parties which choose to. ++ ++ ++2. Disagreements and Arguments ++ ++There have been several lengthy discussion threads [3][4] on LKML ++around the structural constraints of cgroup v2. The two that affect ++the CPU controller are process granularity and no internal process ++constraint. Both arise primarily from the need for common resource ++domain definition across different resources. ++ ++The common resource domain is a powerful concept in cgroup v2 that ++allows controllers to make basic assumptions about the structural ++organization of processes and controllers inside the cgroup hierarchy, ++and thus solve problems spanning multiple types of resources. The ++prime example for this is page cache writeback: dirty page cache is ++regulated through throttling buffered writers based on memory ++availability, and initiating batched write outs to the disk based on ++IO capacity. Tracking and controlling writeback inside a cgroup thus ++requires the direct cooperation of the memory and the IO controller. ++ ++This easily extends to other areas, such as CPU cycles consumed while ++performing memory reclaim or IO encryption. ++ ++ ++2-1. Contentious Restrictions ++ ++For controllers of different resources to work together, they must ++agree on a common organization. This uniform model across controllers ++imposes two contentious restrictions on the CPU controller: process ++granularity and the no-internal-process constraint. ++ ++ ++ 2-1-1. Process Granularity ++ ++ For memory, because an address space is shared between all threads ++ of a process, the terminal consumer is a process, not a thread. ++ Separating the threads of a single process into different memory ++ control domains doesn't make semantical sense. cgroup v2 ensures ++ that all controller can agree on the same organization by requiring ++ that threads of the same process belong to the same cgroup. ++ ++ There are other reasons to enforce process granularity. One ++ important one is isolating system-level management operations from ++ in-process application operations. The cgroup interface, being a ++ virtual filesystem, is very unfit for multiple independent ++ operations taking place at the same time as most operations have to ++ be multi-step and there is no way to synchronize multiple accessors. ++ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity" ++ ++ ++ 2-1-2. No Internal Process Constraint ++ ++ cgroup v2 does not allow processes to belong to any cgroup which has ++ child cgroups when resource controllers are enabled on it (the ++ notable exception being the root cgroup itself). This is because, ++ for some resources, a resource domain (cgroup) is not directly ++ comparable to the terminal consumer (process/task) of said resource, ++ and so putting the two into a sibling relationship isn't meaningful. ++ ++ - Differing Control Parameters and Capabilities ++ ++ A cgroup controller has different resource control parameters and ++ capabilities from a terminal consumer, be that a task or process. ++ There are a couple cases where a cgroup control knob can be mapped ++ to a per-task or per-process API but they are exceptions and the ++ mappings aren't obvious even in those cases. ++ ++ For example, task priorities (also known as nice values) set ++ through setpriority(2) are mapped to the CPU controller ++ "cpu.shares" values. However, how exactly the two ranges map and ++ even the fact that they map to each other at all are not obvious. ++ ++ The situation gets further muddled when considering other resource ++ types and control knobs. IO priorities set through ioprio_set(2) ++ cannot be mapped to IO controller weights and most cgroup resource ++ control knobs including the bandwidth control knobs of the CPU ++ controller don't have counterparts in the terminal consumers. ++ ++ - Anonymous Resource Consumption ++ ++ For CPU, every time slice consumed from inside a cgroup, which ++ comprises most but not all of consumed CPU time for the cgroup, ++ can be clearly attributed to a specific task or process. Because ++ these two types of entities are directly comparable as consumers ++ of CPU time, it's theoretically possible to mix tasks and cgroups ++ on the same tree levels and let them directly compete for the time ++ quota available to their common ancestor. ++ ++ However, the same can't be said for resource types like memory or ++ IO: the memory consumed by the page cache, for example, can be ++ tracked on a per-cgroup level, but due to mismatches in lifetimes ++ of involved objects (page cache can persist long after processes ++ are gone), shared usages and the implementation overhead of ++ tracking persistent state, it can no longer be attributed to ++ individual processes after instantiation. Consequently, any IO ++ incurred by page cache writeback can be attributed to a cgroup, ++ but not to the individual consumers inside the cgroup. ++ ++ For memory and IO, this makes a resource domain (cgroup) an object ++ of a fundamentally different type than a terminal consumer ++ (process). A process can't be a first class object in the resource ++ distribution graph as its total resource consumption can't be ++ described without the containing resource domain. ++ ++ Disallowing processes in internal cgroups avoids competition between ++ cgroups and processes which cannot be meaningfully defined for these ++ resources. All resource control takes place among cgroups and a ++ terminal consumer interacts with the containing cgroup the same way ++ it would with the system without cgroup. ++ ++ Root cgroup is exempt from this constraint, which is in line with ++ how root cgroup is handled in general - it's excluded from cgroup ++ resource accounting and control. ++ ++ ++Enforcing process granularity and no internal process constraint ++allows all controllers to be on the same footing in terms of resource ++distribution hierarchy. ++ ++ ++2-2. Impact on CPU Controller ++ ++As indicated earlier, the CPU controller's resource distribution graph ++is the simplest. Every schedulable resource consumption can be ++attributed to a specific task. In addition, for weight based control, ++the per-task priority set through setpriority(2) can be translated to ++and from a per-cgroup weight. As such, the CPU controller can treat a ++task and a cgroup symmetrically, allowing support for any tree layout ++of cgroups and tasks. Both process granularity and the no internal ++process constraint restrict how the CPU controller can be used. ++ ++ ++ 2-2-1. Impact of Process Granularity ++ ++ Process granularity prevents tasks belonging to the same process to ++ be assigned to different cgroups. It was pointed out [6] that this ++ excludes the valid use case of hierarchical CPU distribution within ++ processes. ++ ++ To address this issue, the rgroup (resource group) [7][8][9] ++ interface, an extension of the existing setpriority(2) API, was ++ proposed, which is in line with other programmable priority ++ mechanisms and eliminates the risk of in-application configuration ++ and system configuration stepping on each other's toes. ++ Unfortunately, the proposal quickly turned into discussions around ++ cgroup v2 design decisions [4] and no consensus could be reached. ++ ++ ++ 2-2-2. Impact of No Internal Process Constraint ++ ++ The no internal process constraint disallows tasks from competing ++ directly against cgroups. Here is an excerpt from Peter Zijlstra ++ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and ++ t4 are tasks: ++ ++ ++ R ++ / | \ ++ t1 t2 A ++ / \ ++ t3 t4 ++ ++ ++ Is fundamentally different from: ++ ++ ++ R ++ / \ ++ L A ++ / \ / \ ++ t1 t2 t3 t4 ++ ++ ++ Because if in the first hierarchy you add a task (t5) to R, all of ++ its A will run at 1/4th of total bandwidth where before it had ++ 1/3rd, whereas with the second example, if you add our t5 to L, A ++ doesn't get any less bandwidth. ++ ++ ++ It is true that the trees are semantically different from each other ++ and the symmetric handling of tasks and cgroups is aesthetically ++ pleasing. However, it isn't clear what the practical usefulness of ++ a layout with direct competition between tasks and cgroups would be, ++ considering that number and behavior of tasks are controlled by each ++ application, and cgroups primarily deal with system level resource ++ distribution; changes in the number of active threads would directly ++ impact resource distribution. Real world use cases of such layouts ++ could not be established during the discussions. ++ ++ ++2-3. Arguments for cgroup v2 ++ ++There are strong demands for comprehensive hierarchical resource ++control across all major resources, and establishing a common resource ++hierarchy is an essential step. As with most engineering decisions, ++common resource hierarchy definition comes with its trade-offs. With ++cgroup v2, the trade-offs are in the form of structural constraints ++which, among others, restrict the CPU controller's space of possible ++configurations. ++ ++However, even with the restrictions, cgroup v2, in combination with ++rgroup, covers most of identified real world use cases while enabling ++new important use cases of resource control across multiple resource ++types that were fundamentally broken previously. ++ ++Furthermore, for resource control, treating resource domains as ++objects of a different type from terminal consumers has important ++advantages - it can account for resource consumptions which are not ++tied to any specific terminal consumer, be that a task or process, and ++allows decoupling resource distribution controls from in-application ++APIs. Even the CPU controller may benefit from it as the kernel can ++consume significant amount of CPU cycles in interrupt context or tasks ++shared across multiple resource domains (e.g. softirq). ++ ++Finally, it's important to note that enabling cgroup v2 support for ++the CPU controller doesn't block use cases which require the features ++which are not available on cgroup v2. Unlikely, but should anybody ++actually rely on the CPU controller's symmetric handling of tasks and ++cgroups, backward compatibility is and will be maintained by being ++able to disconnect the controller from the cgroup v2 hierarchy and use ++it standalone. This also holds for cpuset which is often used in ++highly customized configurations which might be a poor fit for common ++resource domains. ++ ++The required changes are minimal, the benefits for the target use ++cases are critical and obvious, and use cases which have to use v1 can ++continue to do so. ++ ++ ++3. Way Forward ++ ++cgroup v2 primarily aims to solve the problem of comprehensive ++hierarchical resource control across all major computing resources, ++which is one of the core problems of modern server infrastructure ++engineering. The trade-offs that cgroup v2 took are results of ++pursuing that goal and gaining a better understanding of the nature of ++resource control in the process. ++ ++I believe that real world usages will prove cgroup v2's model right, ++considering the crucial pieces of comprehensive resource control that ++cannot be implemented without common resource domains. This is not to ++say that cgroup v2 is fixed in stone and can't be updated; if there is ++an approach which better serves both comprehensive resource control ++and the CPU controller's flexibility, we will surely move towards ++that. It goes without saying that discussions around such approach ++should consider practical aspects of resource control as a whole ++rather than absolutely focusing on a particular controller. ++ ++Until such consensus can be reached, the CPU controller cgroup v2 ++support will be maintained out of the mainline kernel in an easily ++accessible form. If there is anything cgroup developers can do to ++ease the pain, please feel free to contact us on the cgroup mailing ++list at cgroups@vger.kernel.org. ++ ++ ++4. References ++ ++[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org ++ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface ++ Tejun Heo ++ ++[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org ++ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org ++ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo ++ ++[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Peter Zijlstra ++ ++[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt ++ Control Group v2 ++ Tejun Heo ++ ++[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com ++ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Paul Turner ++ ++[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org ++ [RFD] cgroup: thread granularity support for cpu controller ++ Tejun Heo ++ ++[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org ++ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Tejun Heo ++ ++[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org ++ Example program for PRIO_RGRP ++ Tejun Heo ++ ++[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource ++ Peter Zijlstra diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix index 67014b02d9c2..d7ba2652fe17 100644 --- a/pkgs/top-level/all-packages.nix +++ b/pkgs/top-level/all-packages.nix @@ -11716,10 +11716,7 @@ with pkgs; kernelPatches = [ kernelPatches.bridge_stp_helper kernelPatches.p9_fixes - # See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md - # when adding a new linux version - # !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking - # kernelPatches.cpu-cgroup-v2."4.7" + kernelPatches.cpu-cgroup-v2."4.9" kernelPatches.modinst_arg_list_too_long ] ++ lib.optionals ((platform.kernelArch or null) == "mips") @@ -11733,10 +11730,7 @@ with pkgs; kernelPatches = [ kernelPatches.bridge_stp_helper kernelPatches.p9_fixes - # See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md - # when adding a new linux version - # !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking - # kernelPatches.cpu-cgroup-v2."4.7" + kernelPatches.cpu-cgroup-v2."4.10" kernelPatches.modinst_arg_list_too_long ] ++ lib.optionals ((platform.kernelArch or null) == "mips") @@ -11752,8 +11746,7 @@ with pkgs; kernelPatches.p9_fixes # See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md # when adding a new linux version - # !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking - # kernelPatches.cpu-cgroup-v2."4.7" + kernelPatches.cpu-cgroup-v2."4.11" kernelPatches.modinst_arg_list_too_long ] ++ lib.optionals ((platform.kernelArch or null) == "mips")