
Commit dd502a8

Merge tag 'core-static_call-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull static call support from Ingo Molnar:

 "This introduces static_call(), which is the idea of static_branch()
  applied to indirect function calls. Remove a data load (indirection)
  by modifying the text. They give the flexibility of function
  pointers, but with better performance. (This is especially important
  for cases where retpolines would otherwise be used, as retpolines can
  be pretty slow.)

  API overview:

    DECLARE_STATIC_CALL(name, func);
    DEFINE_STATIC_CALL(name, func);
    DEFINE_STATIC_CALL_NULL(name, typename);

    static_call(name)(args...);
    static_call_cond(name)(args...);
    static_call_update(name, func);

  x86 is supported via text patching, otherwise basic indirect calls
  are used, with function pointers.

  There's a second variant using inline code patching, inspired by
  jump-labels, implemented on x86 as well.

  The new APIs are utilized in the x86 perf code, a heavy user of
  function pointers, where static calls speed up the PMU handler by
  4.2% (!).

  The generic implementation is not really exercised on other
  architectures, outside of the trivial test_static_call_init()
  self-test"

* tag 'core-static_call-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  static_call: Fix return type of static_call_init
  tracepoint: Fix out of sync data passing by static caller
  tracepoint: Fix overly long tracepoint names
  x86/perf, static_call: Optimize x86_pmu methods
  tracepoint: Optimize using static_call()
  static_call: Allow early init
  static_call: Add some validation
  static_call: Handle tail-calls
  static_call: Add static_call_cond()
  x86/alternatives: Teach text_poke_bp() to emulate RET
  static_call: Add simple self-test for static calls
  x86/static_call: Add inline static call implementation for x86-64
  x86/static_call: Add out-of-line static call implementation
  static_call: Avoid kprobes on inline static_call()s
  static_call: Add inline static call infrastructure
  static_call: Add basic static call infrastructure
  compiler.h: Make __ADDRESSABLE() symbol truly unique
  jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved()
  module: Properly propagate MODULE_STATE_COMING failure
  module: Fix up module_notifier return values
  ...
2 parents (34eb62d + 69e0ad3), commit dd502a8

47 files changed

Lines changed: 1585 additions & 241 deletions
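As a quick orientation for the API overview quoted in the pull message, here is a minimal, self-contained usage sketch. The names (my_call, my_func, my_other_func, caller, retarget) are illustrative and not part of this commit; only the macros and call forms follow the API listed above.

    #include <linux/static_call.h>

    static int my_func(int arg)       { return arg + 1; }
    static int my_other_func(int arg) { return arg + 2; }

    /* Declaration (typically in a header) and definition with an initial target. */
    DECLARE_STATIC_CALL(my_call, my_func);
    DEFINE_STATIC_CALL(my_call, my_func);

    static int caller(int x)
    {
            /*
             * Compiles to a direct call through a trampoline (or a patched
             * call site with the inline variant); falls back to a plain
             * indirect call on architectures without HAVE_STATIC_CALL.
             */
            return static_call(my_call)(x);
    }

    static void retarget(void)
    {
            /* Re-patch the trampoline / call sites to the new target. */
            static_call_update(my_call, my_other_func);
    }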


arch/Kconfig

Lines changed: 13 additions & 0 deletions
@@ -106,6 +106,12 @@ config STATIC_KEYS_SELFTEST
 	help
 	  Boot time self-test of the branch patching code.
 
+config STATIC_CALL_SELFTEST
+	bool "Static call selftest"
+	depends on HAVE_STATIC_CALL
+	help
+	  Boot time self-test of the call patching code.
+
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
@@ -975,6 +981,13 @@ config HAVE_SPARSE_SYSCALL_NR
 config ARCH_HAS_VDSO_DATA
 	bool
 
+config HAVE_STATIC_CALL
+	bool
+
+config HAVE_STATIC_CALL_INLINE
+	bool
+	depends on HAVE_STATIC_CALL
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"

arch/x86/Kconfig

Lines changed: 3 additions & 1 deletion
@@ -215,6 +215,8 @@ config X86
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
 	select HAVE_STACK_VALIDATION if X86_64
+	select HAVE_STATIC_CALL
+	select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
 	select HAVE_RSEQ
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -230,6 +232,7 @@ config X86
 	select RTC_MC146818_LIB
 	select SPARSE_IRQ
 	select SRCU
+	select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	select USER_STACKTRACE_SUPPORT
@@ -451,7 +454,6 @@ config GOLDFISH
 config RETPOLINE
 	bool "Avoid speculative indirect branches in kernel"
 	default y
-	select STACK_VALIDATION if HAVE_STACK_VALIDATION
 	help
 	  Compile kernel with the retpoline compiler options to guard against
 	  kernel-to-user data leaks by avoiding speculative indirect

arch/x86/events/core.c

Lines changed: 94 additions & 40 deletions
@@ -28,6 +28,7 @@
 #include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/nospec.h>
+#include <linux/static_call.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -52,6 +53,34 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
 
+/*
+ * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
+ * from just a typename, as opposed to an actual function.
+ */
+DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
+DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
+DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
+DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
+DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
+DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
+DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
+DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
+DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
+DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
+DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+
 u64 __read_mostly hw_cache_event_ids
 			[PERF_COUNT_HW_CACHE_MAX]
 			[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -660,7 +689,7 @@ static void x86_pmu_disable(struct pmu *pmu)
 	cpuc->enabled = 0;
 	barrier();
 
-	x86_pmu.disable_all();
+	static_call(x86_pmu_disable_all)();
 }
 
 void x86_pmu_enable_all(int added)
@@ -907,8 +936,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		n0 -= cpuc->n_txn;
 
-	if (x86_pmu.start_scheduling)
-		x86_pmu.start_scheduling(cpuc);
+	static_call_cond(x86_pmu_start_scheduling)(cpuc);
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		c = cpuc->event_constraint[i];
@@ -925,7 +953,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		 * change due to external factors (sibling state, allow_tfa).
 		 */
 		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
-			c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
 			cpuc->event_constraint[i] = c;
 		}
 
@@ -1008,8 +1036,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
-			if (x86_pmu.commit_scheduling)
-				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
 		}
 	} else {
 		for (i = n0; i < n; i++) {
@@ -1018,15 +1045,13 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			/*
 			 * release events that failed scheduling
 			 */
-			if (x86_pmu.put_event_constraints)
-				x86_pmu.put_event_constraints(cpuc, e);
+			static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
 
 			cpuc->event_constraint[i] = NULL;
 		}
 	}
 
-	if (x86_pmu.stop_scheduling)
-		x86_pmu.stop_scheduling(cpuc);
+	static_call_cond(x86_pmu_stop_scheduling)(cpuc);
 
 	return unsched ? -EINVAL : 0;
 }
@@ -1226,7 +1251,7 @@ static void x86_pmu_enable(struct pmu *pmu)
 	cpuc->enabled = 1;
 	barrier();
 
-	x86_pmu.enable_all(added);
+	static_call(x86_pmu_enable_all)(added);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1347,7 +1372,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto done_collect;
 
-	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
 	if (ret)
 		goto out;
 	/*
@@ -1365,13 +1390,11 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 	cpuc->n_added += n - n0;
 	cpuc->n_txn += n - n0;
 
-	if (x86_pmu.add) {
-		/*
-		 * This is before x86_pmu_enable() will call x86_pmu_start(),
-		 * so we enable LBRs before an event needs them etc..
-		 */
-		x86_pmu.add(event);
-	}
+	/*
+	 * This is before x86_pmu_enable() will call x86_pmu_start(),
+	 * so we enable LBRs before an event needs them etc..
+	 */
+	static_call_cond(x86_pmu_add)(event);
 
 	ret = 0;
 out:
@@ -1399,7 +1422,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 	cpuc->events[idx] = event;
 	__set_bit(idx, cpuc->active_mask);
 	__set_bit(idx, cpuc->running);
-	x86_pmu.enable(event);
+	static_call(x86_pmu_enable)(event);
 	perf_event_update_userpage(event);
 }
 
@@ -1469,7 +1492,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 
 	if (test_bit(hwc->idx, cpuc->active_mask)) {
-		x86_pmu.disable(event);
+		static_call(x86_pmu_disable)(event);
 		__clear_bit(hwc->idx, cpuc->active_mask);
 		cpuc->events[hwc->idx] = NULL;
 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
@@ -1519,8 +1542,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	if (i >= cpuc->n_events - cpuc->n_added)
 		--cpuc->n_added;
 
-	if (x86_pmu.put_event_constraints)
-		x86_pmu.put_event_constraints(cpuc, event);
+	static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
 
 	/* Delete the array entry. */
 	while (++i < cpuc->n_events) {
@@ -1533,13 +1555,12 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	perf_event_update_userpage(event);
 
 do_del:
-	if (x86_pmu.del) {
-		/*
-		 * This is after x86_pmu_stop(); so we disable LBRs after any
-		 * event can need them etc..
-		 */
-		x86_pmu.del(event);
-	}
+
+	/*
+	 * This is after x86_pmu_stop(); so we disable LBRs after any
+	 * event can need them etc..
+	 */
+	static_call_cond(x86_pmu_del)(event);
 }
 
 int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -1617,7 +1638,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 		return NMI_DONE;
 
 	start_clock = sched_clock();
-	ret = x86_pmu.handle_irq(regs);
+	ret = static_call(x86_pmu_handle_irq)(regs);
 	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
@@ -1830,6 +1851,38 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
 static struct attribute_group x86_pmu_attr_group;
 static struct attribute_group x86_pmu_caps_group;
 
+static void x86_pmu_static_call_update(void)
+{
+	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
+	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
+	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
+	static_call_update(x86_pmu_enable, x86_pmu.enable);
+	static_call_update(x86_pmu_disable, x86_pmu.disable);
+
+	static_call_update(x86_pmu_add, x86_pmu.add);
+	static_call_update(x86_pmu_del, x86_pmu.del);
+	static_call_update(x86_pmu_read, x86_pmu.read);
+
+	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
+	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
+	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
+
+	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
+	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
+	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
+
+	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
+	static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
+
+	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
+	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+}
+
+static void _x86_pmu_read(struct perf_event *event)
+{
+	x86_perf_event_update(event);
+}
+
 static int __init init_hw_perf_events(void)
 {
 	struct x86_pmu_quirk *quirk;
@@ -1898,6 +1951,11 @@ static int __init init_hw_perf_events(void)
 	pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
 	pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
 
+	if (!x86_pmu.read)
+		x86_pmu.read = _x86_pmu_read;
+
+	x86_pmu_static_call_update();
+
 	/*
 	 * Install callbacks. Core will call them for each online
 	 * cpu.
@@ -1934,11 +1992,9 @@ static int __init init_hw_perf_events(void)
 }
 early_initcall(init_hw_perf_events);
 
-static inline void x86_pmu_read(struct perf_event *event)
+static void x86_pmu_read(struct perf_event *event)
 {
-	if (x86_pmu.read)
-		return x86_pmu.read(event);
-	x86_perf_event_update(event);
+	static_call(x86_pmu_read)(event);
 }
 
 /*
@@ -2015,7 +2071,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	if (!x86_pmu_initialized())
 		return -EAGAIN;
 
-	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
 	if (ret)
 		return ret;
 
@@ -2308,15 +2364,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 
 static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
-	if (x86_pmu.sched_task)
-		x86_pmu.sched_task(ctx, sched_in);
+	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
 }
 
 static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
 				  struct perf_event_context *next)
 {
-	if (x86_pmu.swap_task_ctx)
-		x86_pmu.swap_task_ctx(prev, next);
+	static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
 }
 
 void perf_check_microcode(void)
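The arch/x86/events/core.c changes above repeat one pattern: an optional x86_pmu callback guarded by an "if (x86_pmu.foo)" test becomes a DEFINE_STATIC_CALL_NULL() site invoked through static_call_cond() and retargeted once in x86_pmu_static_call_update(). Below is a condensed sketch of that pattern; the names (demo_ops, demo_prepare, demo_setup, demo_use) are hypothetical stand-ins for the x86_pmu specifics.

    #include <linux/static_call.h>

    struct demo_ops {
            void (*prepare)(int cpu);       /* optional, may be NULL */
    };
    static struct demo_ops demo_ops;

    /* Starts out pointing at a "do nothing" trampoline (NULL target). */
    DEFINE_STATIC_CALL_NULL(demo_prepare, *demo_ops.prepare);

    static void demo_setup(void)
    {
            /*
             * Called once the backend has filled in demo_ops; a NULL
             * pointer simply keeps the call patched out.
             */
            static_call_update(demo_prepare, demo_ops.prepare);
    }

    static void demo_use(int cpu)
    {
            /* Replaces: if (demo_ops.prepare) demo_ops.prepare(cpu); */
            static_call_cond(demo_prepare)(cpu);
    }

static_call_cond() is only valid for callbacks that return void, which is why the value-returning x86_pmu methods above (handle_irq, schedule_events, get_event_constraints) keep plain static_call().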

arch/x86/include/asm/static_call.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CALL_H
+#define _ASM_STATIC_CALL_H
+
+#include <asm/text-patching.h>
+
+/*
+ * For CONFIG_HAVE_STATIC_CALL_INLINE, this is a temporary trampoline which
+ * uses the current value of the key->func pointer to do an indirect jump to
+ * the function. This trampoline is only used during boot, before the call
+ * sites get patched by static_call_update(). The name of this trampoline has
+ * a magical aspect: objtool uses it to find static call sites so it can create
+ * the .static_call_sites section.
+ *
+ * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which
+ * does a direct jump to the function. The direct jump gets patched by
+ * static_call_update().
+ *
+ * Having the trampoline in a special section forces GCC to emit a JMP.d32 when
+ * it does tail-call optimization on the call; since you cannot compute the
+ * relative displacement across sections.
+ */
+
+#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insns)			\
+	asm(".pushsection .static_call.text, \"ax\" \n"		\
+	    ".align 4 \n"						\
+	    ".globl " STATIC_CALL_TRAMP_STR(name) " \n"		\
+	    STATIC_CALL_TRAMP_STR(name) ": \n"				\
+	    insns " \n"						\
+	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n"	\
+	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+	    ".popsection \n")
+
+#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
+
+#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+#endif /* _ASM_STATIC_CALL_H */
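For reference, the out-of-line trampoline that ARCH_DEFINE_STATIC_CALL_TRAMP(foo, my_func) above emits looks roughly like the assembly below. The names foo and my_func are hypothetical, and the __SCT__ symbol prefix is an assumption about what STATIC_CALL_TRAMP_STR() expands to in the generic static_call headers.

    	.pushsection .static_call.text, "ax"
    	.align 4
    	.globl __SCT__foo
    __SCT__foo:
    	.byte 0xe9			/* JMP rel32 opcode */
    	.long my_func - (. + 4)		/* 32-bit displacement to the target */
    	.type __SCT__foo, @function
    	.size __SCT__foo, . - __SCT__foo
    	.popsection

The ".byte 0xe9" plus a 32-bit displacement is a 5-byte JMP rel32, and the NULL variant's "ret; nop; nop; nop; nop" fills the same 5 bytes, so static_call_update() can later patch either form into a direct jump to a new target (or back into a plain return).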
