Skip to content

Commit c80ed4e

Browse files
atomic-kernel1Naim
authored and committed
sched/core: Make finish_task_switch() and its subfunctions always inline
finish_task_switch() is not inlined even in the O2 level optimization, performance testing indicates that this could lead to a significant performance degradation when certain Spectre vulnerability mitigations are enabled. In switch_mm_irq_off(), some mitigations may clear branch prediction history, or the instruction cache, like arm64_apply_bp_hardening() on arm64, BPIALL/ICIALLU on arm, and indirect_branch_prediction_barrier() on x86. finish_task_switch() is right after switch_mm_irqs_off(), the performance is greatly affected by function calls and branch jumps. __schedule() has a __sched attribute, which makes it be placed in '.sched.text' section, while finish_task_switch() does not. This makes they "far away from each other" in vmlinux, which aggravating the performance degradation. Make finish_task_switch() and its subfunctions always inline to optimize the performance. Performance test data - time spent on calling finish_task_switch(): 1. x86-64: Intel i5-8300h@4Ghz, DDR4@2666mhz; unit: x86's tsc | test scenario | old | new | delta | | gcc 15.2 | 27.50 | 25.45 | -2.05 ( -7.5%) | | gcc 15.2 + spectre_v2_user=on | 46.75 | 25.96 | -20.79 (-44.5%) | | clang 21.1.7 | 27.25 | 25.45 | -1.80 ( -6.6%) | | clang 21.1.7 + spectre_v2_user=on | 39.50 | 26.00 | -13.50 (-34.2%) | 2. x86-64: AMD 9600x@5.45Ghz, DDR5@4800mhz; unit: x86's tsc | test scenario | old | new | delta | | gcc 15.2 | 27.51 | 27.51 | 0 ( 0%) | | gcc 15.2 + spectre_v2_user=on | 105.21 | 67.89 | -37.32 (-35.5%) | | clang 21.1.7 | 27.51 | 27.51 | 0 ( 0%) | | clang 21.1.7 + spectre_v2_user=on | 104.15 | 67.52 | -36.63 (-35.2%) | 3. arm64: Raspberry Pi 3b Rev 1.2, Cortex-A53@1.2Ghz; unit: cntvct_el0 | test scenario | old | new | delta | | gcc 15.2 | 1.453 | 1.115 | -0.338 (-23.3%) | | clang 21.1.7 | 1.532 | 1.123 | -0.409 (-26.7%) | 4. 
arm32: Raspberry Pi 3b Rev 1.2, Cortex-A53@1.2Ghz; unit: cntvct_el0 | test scenario | old | new | delta | | gcc 15.2 | 1.421 | 1.187 | -0.234 (-16.5%) | | clang 21.1.7 | 1.437 | 1.200 | -0.237 (-16.5%) | Cc: Thomas Gleixner <tglx@kernel.org> Cc: Rik van Riel <riel@surriel.com> Cc: Segher Boessenkool <segher@kernel.crashing.org> Cc: David Hildenbrand (Red Hat) <david@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: H. Peter Anvin (Intel) <hpa@zytor.com> Cc: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Xie Yuanbin <qq570070308@gmail.com>
1 parent 7d6c90b commit c80ed4e

11 files changed

Lines changed: 35 additions & 35 deletions

File tree

arch/arm/include/asm/mmu_context.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ static inline void check_and_switch_context(struct mm_struct *mm,
8080
#ifndef MODULE
8181
#define finish_arch_post_lock_switch \
8282
finish_arch_post_lock_switch
83-
static inline void finish_arch_post_lock_switch(void)
83+
static __always_inline void finish_arch_post_lock_switch(void)
8484
{
8585
struct mm_struct *mm = current->mm;
8686

arch/riscv/include/asm/sync_core.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* RISC-V implements return to user-space through an xRET instruction,
77
* which is not core serializing.
88
*/
9-
static inline void sync_core_before_usermode(void)
9+
static __always_inline void sync_core_before_usermode(void)
1010
{
1111
asm volatile ("fence.i" ::: "memory");
1212
}

arch/s390/include/asm/mmu_context.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
9393
}
9494

9595
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
96-
static inline void finish_arch_post_lock_switch(void)
96+
static __always_inline void finish_arch_post_lock_switch(void)
9797
{
9898
struct task_struct *tsk = current;
9999
struct mm_struct *mm = tsk->mm;

arch/sparc/include/asm/mmu_context_64.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ static inline void arch_start_context_switch(struct task_struct *prev)
160160
}
161161

162162
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
163-
static inline void finish_arch_post_lock_switch(void)
163+
static __always_inline void finish_arch_post_lock_switch(void)
164164
{
165165
/* Restore the state of MCDPER register for the new process
166166
* just switched to.

arch/x86/include/asm/sync_core.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ static __always_inline void sync_core(void)
9393
* to user-mode. x86 implements return to user-space through sysexit,
9494
* sysrel, and sysretq, which are not core serializing.
9595
*/
96-
static inline void sync_core_before_usermode(void)
96+
static __always_inline void sync_core_before_usermode(void)
9797
{
9898
/* With PTI, we unconditionally serialize before running user code. */
9999
if (static_cpu_has(X86_FEATURE_PTI))

include/linux/perf_event.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
16321632
task->sched_migrated = 1;
16331633
}
16341634

1635-
static inline void perf_event_task_sched_in(struct task_struct *prev,
1635+
static __always_inline void perf_event_task_sched_in(struct task_struct *prev,
16361636
struct task_struct *task)
16371637
{
16381638
if (static_branch_unlikely(&perf_sched_events))

include/linux/sched/mm.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ static inline void smp_mb__after_mmgrab(void)
4444

4545
extern void __mmdrop(struct mm_struct *mm);
4646

47-
static inline void mmdrop(struct mm_struct *mm)
47+
static __always_inline void mmdrop(struct mm_struct *mm)
4848
{
4949
/*
5050
* The implicit full barrier implied by atomic_dec_and_test() is
@@ -71,14 +71,14 @@ static inline void __mmdrop_delayed(struct rcu_head *rhp)
7171
* Invoked from finish_task_switch(). Delegates the heavy lifting on RT
7272
* kernels via RCU.
7373
*/
74-
static inline void mmdrop_sched(struct mm_struct *mm)
74+
static __always_inline void mmdrop_sched(struct mm_struct *mm)
7575
{
7676
/* Provides a full memory barrier. See mmdrop() */
7777
if (atomic_dec_and_test(&mm->mm_count))
7878
call_rcu(&mm->delayed_drop, __mmdrop_delayed);
7979
}
8080
#else
81-
static inline void mmdrop_sched(struct mm_struct *mm)
81+
static __always_inline void mmdrop_sched(struct mm_struct *mm)
8282
{
8383
mmdrop(mm);
8484
}
@@ -104,7 +104,7 @@ static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
104104
}
105105
}
106106

107-
static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
107+
static __always_inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
108108
{
109109
if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
110110
mmdrop_sched(mm);
@@ -532,7 +532,7 @@ enum {
532532
#include <asm/membarrier.h>
533533
#endif
534534

535-
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
535+
static __always_inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
536536
{
537537
/*
538538
* The atomic_read() below prevents CSE. The following should

include/linux/tick.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ extern cpumask_var_t tick_nohz_full_mask;
177177
#ifdef CONFIG_NO_HZ_FULL
178178
extern bool tick_nohz_full_running;
179179

180-
static inline bool tick_nohz_full_enabled(void)
180+
static __always_inline bool tick_nohz_full_enabled(void)
181181
{
182182
if (!context_tracking_enabled())
183183
return false;
@@ -301,7 +301,7 @@ static inline void __tick_nohz_task_switch(void) { }
301301
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
302302
#endif
303303

304-
static inline void tick_nohz_task_switch(void)
304+
static __always_inline void tick_nohz_task_switch(void)
305305
{
306306
if (tick_nohz_full_enabled())
307307
__tick_nohz_task_switch();

include/linux/vtime.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,24 +67,24 @@ static __always_inline void vtime_account_guest_exit(void)
6767
* For now vtime state is tied to context tracking. We might want to decouple
6868
* those later if necessary.
6969
*/
70-
static inline bool vtime_accounting_enabled(void)
70+
static __always_inline bool vtime_accounting_enabled(void)
7171
{
7272
return context_tracking_enabled();
7373
}
7474

75-
static inline bool vtime_accounting_enabled_cpu(int cpu)
75+
static __always_inline bool vtime_accounting_enabled_cpu(int cpu)
7676
{
7777
return context_tracking_enabled_cpu(cpu);
7878
}
7979

80-
static inline bool vtime_accounting_enabled_this_cpu(void)
80+
static __always_inline bool vtime_accounting_enabled_this_cpu(void)
8181
{
8282
return context_tracking_enabled_this_cpu();
8383
}
8484

8585
extern void vtime_task_switch_generic(struct task_struct *prev);
8686

87-
static inline void vtime_task_switch(struct task_struct *prev)
87+
static __always_inline void vtime_task_switch(struct task_struct *prev)
8888
{
8989
if (vtime_accounting_enabled_this_cpu())
9090
vtime_task_switch_generic(prev);

kernel/sched/core.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4891,7 +4891,7 @@ static inline void prepare_task(struct task_struct *next)
48914891
WRITE_ONCE(next->on_cpu, 1);
48924892
}
48934893

4894-
static inline void finish_task(struct task_struct *prev)
4894+
static __always_inline void finish_task(struct task_struct *prev)
48954895
{
48964896
/*
48974897
* This must be the very last reference to @prev from this CPU. After
@@ -4907,7 +4907,7 @@ static inline void finish_task(struct task_struct *prev)
49074907
smp_store_release(&prev->on_cpu, 0);
49084908
}
49094909

4910-
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
4910+
static __always_inline void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
49114911
{
49124912
void (*func)(struct rq *rq);
49134913
struct balance_callback *next;
@@ -4942,7 +4942,7 @@ struct balance_callback balance_push_callback = {
49424942
.func = balance_push,
49434943
};
49444944

4945-
static inline struct balance_callback *
4945+
static __always_inline struct balance_callback *
49464946
__splice_balance_callbacks(struct rq *rq, bool split)
49474947
{
49484948
struct balance_callback *head = rq->balance_callback;
@@ -5016,7 +5016,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
50165016
__acquire(__rq_lockp(this_rq()));
50175017
}
50185018

5019-
static inline void finish_lock_switch(struct rq *rq)
5019+
static __always_inline void finish_lock_switch(struct rq *rq)
50205020
__releases(__rq_lockp(rq))
50215021
{
50225022
/*
@@ -5049,7 +5049,7 @@ static inline void kmap_local_sched_out(void)
50495049
#endif
50505050
}
50515051

5052-
static inline void kmap_local_sched_in(void)
5052+
static __always_inline void kmap_local_sched_in(void)
50535053
{
50545054
#ifdef CONFIG_KMAP_LOCAL
50555055
if (unlikely(current->kmap_ctrl.idx))
@@ -5103,7 +5103,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
51035103
* past. 'prev == current' is still correct but we need to recalculate this_rq
51045104
* because prev may have moved to another CPU.
51055105
*/
5106-
static struct rq *finish_task_switch(struct task_struct *prev)
5106+
static __always_inline struct rq *finish_task_switch(struct task_struct *prev)
51075107
__releases(__rq_lockp(this_rq()))
51085108
{
51095109
struct rq *rq = this_rq();

0 commit comments

Comments
 (0)