Skip to content

Commit edaa5dd

Browse files
committed
Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.

 - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...
2 parents 13cb734 + feff2e6 commit edaa5dd

19 files changed

Lines changed: 803 additions & 152 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
1540715407
R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
1540815408
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
1540915409
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
15410+
R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
1541015411
L: linux-kernel@vger.kernel.org
1541115412
S: Maintained
1541215413
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core

arch/arm/kernel/topology.c

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
177177
static inline void update_cpu_capacity(unsigned int cpuid) {}
178178
#endif
179179

180-
/*
181-
* The current assumption is that we can power gate each core independently.
182-
* This will be superseded by DT binding once available.
183-
*/
184-
const struct cpumask *cpu_corepower_mask(int cpu)
185-
{
186-
return &cpu_topology[cpu].thread_sibling;
187-
}
188-
189180
/*
190181
* store_cpu_topology is called at boot when only one cpu is running
191182
* and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -241,20 +232,6 @@ void store_cpu_topology(unsigned int cpuid)
241232
update_siblings_masks(cpuid);
242233
}
243234

244-
static inline int cpu_corepower_flags(void)
245-
{
246-
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
247-
}
248-
249-
static struct sched_domain_topology_level arm_topology[] = {
250-
#ifdef CONFIG_SCHED_MC
251-
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
252-
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
253-
#endif
254-
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
255-
{ NULL, },
256-
};
257-
258235
/*
259236
* init_cpu_topology is called at boot when only one cpu is running
260237
* which prevent simultaneous write access to cpu_topology array
@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
265242
smp_wmb();
266243

267244
parse_dt_topology();
268-
269-
/* Set scheduler topology descriptor */
270-
set_sched_topology(arm_topology);
271245
}

include/linux/sched.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
14911491
/*
14921492
* Per process flags
14931493
*/
1494+
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
14941495
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
14951496
#define PF_EXITING 0x00000004 /* Getting shut down */
1496-
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1497+
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
14971498
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
14981499
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
14991500
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
15171518
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
15181519
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
15191520
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
1520-
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
15211521
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
15221522
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
15231523

@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
20462046
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
20472047

20482048
int sched_trace_rq_cpu(struct rq *rq);
2049+
int sched_trace_rq_cpu_capacity(struct rq *rq);
20492050
int sched_trace_rq_nr_running(struct rq *rq);
20502051

20512052
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

include/linux/sched/mm.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,10 +348,13 @@ enum {
348348
MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
349349
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
350350
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
351+
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
352+
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
351353
};
352354

353355
enum {
354356
MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
357+
MEMBARRIER_FLAG_RSEQ = (1U << 1),
355358
};
356359

357360
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS

include/linux/sched/sd_flags.h

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* sched-domains (multiprocessor balancing) flag declarations.
4+
*/
5+
6+
#ifndef SD_FLAG
7+
# error "Incorrect import of SD flags definitions"
8+
#endif
9+
10+
/*
11+
* Hierarchical metaflags
12+
*
13+
* SHARED_CHILD: These flags are meant to be set from the base domain upwards.
14+
* If a domain has this flag set, all of its children should have it set. This
15+
* is usually because the flag describes some shared resource (all CPUs in that
16+
* domain share the same resource), or because they are tied to a scheduling
17+
* behaviour that we want to disable at some point in the hierarchy for
18+
* scalability reasons.
19+
*
20+
* In those cases it doesn't make sense to have the flag set for a domain but
21+
* not have it in (some of) its children: sched domains ALWAYS span their child
22+
* domains, so operations done with parent domains will cover CPUs in the lower
23+
* child domains.
24+
*
25+
*
26+
* SHARED_PARENT: These flags are meant to be set from the highest domain
27+
* downwards. If a domain has this flag set, all of its parents should have it
28+
* set. This is usually for topology properties that start to appear above a
29+
* certain level (e.g. domain starts spanning CPUs outside of the base CPU's
30+
* socket).
31+
*/
32+
#define SDF_SHARED_CHILD 0x1
33+
#define SDF_SHARED_PARENT 0x2
34+
35+
/*
36+
* Behavioural metaflags
37+
*
38+
* NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
39+
* more than one group. This is usually for balancing flags (load balancing
40+
* involves equalizing a metric between groups), or for flags describing some
41+
* shared resource (which would be shared between groups).
42+
*/
43+
#define SDF_NEEDS_GROUPS 0x4
44+
45+
/*
46+
* Balance when about to become idle
47+
*
48+
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
49+
* NEEDS_GROUPS: Load balancing flag.
50+
*/
51+
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
52+
53+
/*
54+
* Balance on exec
55+
*
56+
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
57+
* NEEDS_GROUPS: Load balancing flag.
58+
*/
59+
SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
60+
61+
/*
62+
* Balance on fork, clone
63+
*
64+
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
65+
* NEEDS_GROUPS: Load balancing flag.
66+
*/
67+
SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
68+
69+
/*
70+
* Balance on wakeup
71+
*
72+
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
73+
* NEEDS_GROUPS: Load balancing flag.
74+
*/
75+
SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
76+
77+
/*
78+
* Consider waking task on waking CPU.
79+
*
80+
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
81+
*/
82+
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
83+
84+
/*
85+
* Domain members have different CPU capacities
86+
*
87+
* SHARED_PARENT: Set from the topmost domain down to the first domain where
88+
* asymmetry is detected.
89+
* NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
90+
*/
91+
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
92+
93+
/*
94+
* Domain members share CPU capacity (i.e. SMT)
95+
*
96+
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
97+
* CPU capacity.
98+
* NEEDS_GROUPS: Capacity is shared between groups.
99+
*/
100+
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
101+
102+
/*
103+
* Domain members share CPU package resources (i.e. caches)
104+
*
105+
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
106+
* the same cache(s).
107+
* NEEDS_GROUPS: Caches are shared between groups.
108+
*/
109+
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
110+
111+
/*
112+
* Only a single load balancing instance
113+
*
114+
* SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
115+
* different level upwards, but it doesn't change that if a
116+
* domain has this flag set, then all of its parents need to have
117+
* it too (otherwise the serialization doesn't make sense).
118+
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
119+
*/
120+
SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
121+
122+
/*
123+
* Place busy tasks earlier in the domain
124+
*
125+
* SHARED_CHILD: Usually set on the SMT level. Technically could be set further
126+
* up, but currently assumed to be set from the base domain
127+
* upwards (see update_top_cache_domain()).
128+
* NEEDS_GROUPS: Load balancing flag.
129+
*/
130+
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
131+
132+
/*
133+
* Prefer to place tasks in a sibling domain
134+
*
135+
* Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
136+
* flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
137+
*
138+
* NEEDS_GROUPS: Load balancing flag.
139+
*/
140+
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
141+
142+
/*
143+
* sched_groups of this level overlap
144+
*
145+
* SHARED_PARENT: Set for all NUMA levels above NODE.
146+
* NEEDS_GROUPS: Overlaps can only exist with more than one group.
147+
*/
148+
SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
149+
150+
/*
151+
* Cross-node balancing
152+
*
153+
* SHARED_PARENT: Set for all NUMA levels above NODE.
154+
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
155+
*/
156+
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

include/linux/sched/topology.h

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,29 @@
1111
*/
1212
#ifdef CONFIG_SMP
1313

14-
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
15-
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
16-
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
17-
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
18-
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
19-
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
20-
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
21-
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
22-
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
23-
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
24-
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
25-
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
26-
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
27-
#define SD_NUMA 0x2000 /* cross-node balancing */
14+
/* Generate SD flag indexes */
15+
#define SD_FLAG(name, mflags) __##name,
16+
enum {
17+
#include <linux/sched/sd_flags.h>
18+
__SD_FLAG_CNT,
19+
};
20+
#undef SD_FLAG
21+
/* Generate SD flag bits */
22+
#define SD_FLAG(name, mflags) name = 1 << __##name,
23+
enum {
24+
#include <linux/sched/sd_flags.h>
25+
};
26+
#undef SD_FLAG
27+
28+
#ifdef CONFIG_SCHED_DEBUG
29+
30+
struct sd_flag_debug {
31+
unsigned int meta_flags;
32+
char *name;
33+
};
34+
extern const struct sd_flag_debug sd_flag_debug[];
35+
36+
#endif
2837

2938
#ifdef CONFIG_SCHED_SMT
3039
static inline int cpu_smt_flags(void)

include/linux/syscalls.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
974974
const char __user *const __user *argv,
975975
const char __user *const __user *envp, int flags);
976976
asmlinkage long sys_userfaultfd(int flags);
977-
asmlinkage long sys_membarrier(int cmd, int flags);
977+
asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
978978
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
979979
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
980980
int fd_out, loff_t __user *off_out,

include/trace/events/sched.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
630630
TP_PROTO(struct sched_entity *se),
631631
TP_ARGS(se));
632632

633+
DECLARE_TRACE(sched_cpu_capacity_tp,
634+
TP_PROTO(struct rq *rq),
635+
TP_ARGS(rq));
636+
633637
DECLARE_TRACE(sched_overutilized_tp,
634638
TP_PROTO(struct root_domain *rd, bool overutilized),
635639
TP_ARGS(rd, overutilized));

include/uapi/linux/membarrier.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,26 @@
114114
* If this command is not implemented by an
115115
* architecture, -EINVAL is returned.
116116
* Returns 0 on success.
117+
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
118+
* Ensure the caller thread, upon return from
119+
* system call, that all its running thread
120+
* siblings have any currently running rseq
121+
* critical sections restarted if @flags
122+
* parameter is 0; if @flags parameter is
123+
* MEMBARRIER_CMD_FLAG_CPU,
124+
* then this operation is performed only
125+
* on CPU indicated by @cpu_id. If this command is
126+
* not implemented by an architecture, -EINVAL
127+
* is returned. A process needs to register its
128+
* intent to use the private expedited rseq
129+
* command prior to using it, otherwise
130+
* this command returns -EPERM.
131+
* @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
132+
* Register the process intent to use
133+
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
134+
* If this command is not implemented by an
135+
* architecture, -EINVAL is returned.
136+
* Returns 0 on success.
117137
* @MEMBARRIER_CMD_SHARED:
118138
* Alias to MEMBARRIER_CMD_GLOBAL. Provided for
119139
* header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
131151
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
132152
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
133153
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
154+
MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
155+
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),
134156

135157
/* Alias for header backward compatibility. */
136158
MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
137159
};
138160

161+
enum membarrier_cmd_flag {
162+
MEMBARRIER_CMD_FLAG_CPU = (1 << 0),
163+
};
164+
139165
#endif /* _UAPI_LINUX_MEMBARRIER_H */

0 commit comments

Comments
 (0)