diff options
| author | Mark Brown <broonie@kernel.org> | 2026-05-29 23:00:10 +0100 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2026-05-29 23:00:10 +0100 |
| commit | e8b79e654b43e511a9ff0af1963f86718564c7d6 (patch) | |
| tree | 716d1d18f145b89bd61f852ef7ac0b94914f6e5a /tools | |
| parent | 8d353be232212e7e9a53c582d6cbc9570ad24ab8 (diff) | |
| parent | 1c24a913b8ebd4e7ef86e1259f3a77b06b4911b9 (diff) | |
| download | linux-next-history-e8b79e654b43e511a9ff0af1963f86718564c7d6.tar.gz | |
Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
Diffstat (limited to 'tools')
| -rw-r--r-- | tools/sched_ext/README.md | 6 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/cid.bpf.h | 678 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/common.bpf.h | 28 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/compat.bpf.h | 30 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/compat.h | 23 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/user_exit_info.bpf.h | 3 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/user_exit_info.h | 2 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/user_exit_info_common.h | 5 | ||||
| -rw-r--r-- | tools/sched_ext/scx_central.bpf.c | 10 | ||||
| -rw-r--r-- | tools/sched_ext/scx_cpu0.bpf.c | 2 | ||||
| -rw-r--r-- | tools/sched_ext/scx_cpu0.c | 2 | ||||
| -rw-r--r-- | tools/sched_ext/scx_flatcg.c | 12 | ||||
| -rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c | 890 | ||||
| -rw-r--r-- | tools/sched_ext/scx_qmap.c | 107 | ||||
| -rw-r--r-- | tools/sched_ext/scx_qmap.h | 73 | ||||
| -rw-r--r-- | tools/sched_ext/scx_show_state.py | 19 | ||||
| -rw-r--r-- | tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c | 7 | ||||
| -rw-r--r-- | tools/testing/selftests/sched_ext/select_cpu_dfl.c | 54 |
18 files changed, 1531 insertions, 420 deletions
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 6e282bce453cc..0ee5a3d997e5d 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -168,9 +168,9 @@ well on single-socket systems with a unified L3 cache. Another simple, yet slightly more complex scheduler that provides an example of a basic weighted FIFO queuing policy. It also provides examples of some common -useful BPF features, such as sleepable per-task storage allocation in the -`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -enqueue tasks. It also illustrates how core-sched support could be implemented. +useful BPF features, such as arena-backed doubly-linked lists threaded through +per-task context and `bpf_res_spin_lock` for per-queue synchronization. It also +illustrates how core-sched support could be implemented. ## scx_central diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h new file mode 100644 index 0000000000000..70f2a3829af4d --- /dev/null +++ b/tools/sched_ext/include/scx/cid.bpf.h @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the + * authoritative layout and semantics. The BPF-side helpers use the cmask_* + * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is + * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every + * helper is inline (no .c counterpart). + * + * Included by scx/common.bpf.h; don't include directly. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef __SCX_CID_BPF_H +#define __SCX_CID_BPF_H + +#include "bpf_arena_common.bpf.h" + +#ifndef BIT_U64 +#define BIT_U64(nr) (1ULL << (nr)) +#endif +#ifndef GENMASK_U64 +#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h)))) +#endif + +/* + * Storage cap for bounded loops over bits[]. 
Sized to cover NR_CPUS=8192 with + * one extra word for head-misalignment. Increase if deployment targets larger + * NR_CPUS. + */ +#ifndef CMASK_MAX_WORDS +#define CMASK_MAX_WORDS 129 +#endif + +/* + * Mirrors SCX_CMASK_NR_WORDS in kernel/sched/ext_types.h. The u64 cast keeps + * the +63 from wrapping when @nr_cids is near U32_MAX, so cmask_reframe() + * bounds-checking the result against alloc_words catches the overflow instead + * of seeing a small value. + */ +#define CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1)) + +static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid) +{ + return cid >= m->base && cid < m->base + m->nr_cids; +} + +static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid) +{ + return (u64 __arena *)&m->bits[cid / 64 - m->base / 64]; +} + +/** + * __cmask_init - Initialize @m with explicit storage capacity + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * @alloc_cids: storage capacity in cids, at least @nr_cids + * + * Use when storage is sized larger than the initial active range. All of + * bits[] is zeroed. + */ +static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base, + u32 nr_cids, u32 alloc_cids) +{ + u32 alloc_words, i; + + if (unlikely(nr_cids > alloc_cids)) { + scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u", + nr_cids, alloc_cids); + return; + } + alloc_words = CMASK_NR_WORDS(alloc_cids); + + m->base = base; + m->nr_cids = nr_cids; + m->alloc_words = alloc_words; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= alloc_words) + break; + m->bits[i] = 0; + } +} + +/** + * cmask_init - Initialize @m on tight storage + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * + * All of bits[] is zeroed. 
+ */ +static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids) +{ + __cmask_init(m, base, nr_cids, nr_cids); +} + +/** + * cmask_reframe - Reshape @m's active range without resizing storage + * @m: cmask to reframe + * @base: new active range base + * @nr_cids: new active range length, must fit within @m->alloc_words + * + * Body bits within the new range become garbage - only the head and tail + * words are zeroed to keep the padding invariant. + */ +static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids) +{ + if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) { + scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u", + nr_cids, m->alloc_words); + return; + } + if (nr_cids) { + u32 last_word = ((base & 63) + nr_cids - 1) / 64; + + m->bits[0] = 0; + m->bits[last_word] = 0; + } + m->base = base; + m->nr_cids = nr_cids; +} + +static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid) +{ + if (!__cmask_contains(m, cid)) + return false; + return *__cmask_word(m, cid) & BIT_U64(cid & 63); +} + +/* + * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena + * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only + * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement + * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop. + * + * CMASK_CAS_TRIES is sized so exhausting it means seconds of real spinning + * on one word - past any plausible contention. Abort hard. 
+ */ +#define CMASK_CAS_TRIES (1U << 23) + +static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(m, cid)) + return; + w = __cmask_word(m, cid); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (old & bit) + return; + new = old | bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return; + } + scx_bpf_error("cmask_set CAS exhausted at cid %u", cid); +} + +static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(m, cid)) + return; + w = __cmask_word(m, cid); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (!(old & bit)) + return; + new = old & ~bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return; + } + scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid); +} + +static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(m, cid)) + return false; + w = __cmask_word(m, cid); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (old & bit) + return true; + new = old | bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return false; + } + scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid); + return false; +} + +static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(m, cid)) + return false; + w = __cmask_word(m, cid); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (!(old & bit)) + return false; + new = old & ~bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return true; + } + scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid); + return false; +} + +static 
__always_inline void __cmask_set(struct scx_cmask __arena *m, u32 cid) +{ + if (!__cmask_contains(m, cid)) + return; + *__cmask_word(m, cid) |= BIT_U64(cid & 63); +} + +static __always_inline void __cmask_clear(struct scx_cmask __arena *m, u32 cid) +{ + if (!__cmask_contains(m, cid)) + return; + *__cmask_word(m, cid) &= ~BIT_U64(cid & 63); +} + +static __always_inline bool __cmask_test_and_set(struct scx_cmask __arena *m, u32 cid) +{ + u64 bit = BIT_U64(cid & 63); + u64 __arena *w; + u64 prev; + + if (!__cmask_contains(m, cid)) + return false; + w = __cmask_word(m, cid); + prev = *w & bit; + *w |= bit; + return prev; +} + +static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid) +{ + u64 bit = BIT_U64(cid & 63); + u64 __arena *w; + u64 prev; + + if (!__cmask_contains(m, cid)) + return false; + w = __cmask_word(m, cid); + prev = *w & bit; + *w &= ~bit; + return prev; +} + +static __always_inline void cmask_zero(struct scx_cmask __arena *m) +{ + u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + m->bits[i] = 0; + } +} + +/* + * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_* + * enum in ext_cid.c, which is exported via BTF and reachable through + * vmlinux.h. 
+ */ +enum { + BPF_CMASK_OP_AND, + BPF_CMASK_OP_OR, + BPF_CMASK_OP_COPY, + BPF_CMASK_OP_ANDNOT, +}; + +static __always_inline void cmask_op_word(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src, + u32 di, u32 si, u64 mask, int op) +{ + u64 dv = dst->bits[di]; + u64 sv = src->bits[si]; + u64 rv; + + if (op == BPF_CMASK_OP_AND) + rv = dv & sv; + else if (op == BPF_CMASK_OP_OR) + rv = dv | sv; + else if (op == BPF_CMASK_OP_ANDNOT) + rv = dv & ~sv; + else + rv = sv; + + dst->bits[di] = (dv & ~mask) | (rv & mask); +} + +static __always_inline void cmask_op(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src, int op) +{ + u32 d_end = dst->base + dst->nr_cids; + u32 s_end = src->base + src->nr_cids; + u32 lo = dst->base > src->base ? dst->base : src->base; + u32 hi = d_end < s_end ? d_end : s_end; + u32 d_base = dst->base / 64; + u32 s_base = src->base / 64; + u32 lo_word, hi_word, i; + u64 head_mask, tail_mask; + + if (lo >= hi) + return; + + lo_word = lo / 64; + hi_word = (hi - 1) / 64; + head_mask = GENMASK_U64(63, lo & 63); + tail_mask = GENMASK_U64((hi - 1) & 63, 0); + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 w = lo_word + i; + u64 m; + + if (w > hi_word) + break; + + m = GENMASK_U64(63, 0); + if (w == lo_word) + m &= head_mask; + if (w == hi_word) + m &= tail_mask; + + cmask_op_word(dst, src, w - d_base, w - s_base, m, op); + } +} + +/* + * cmask_and/or/copy/andnot only modify @dst bits that lie in the intersection of + * [@dst->base, @dst->base + @dst->nr_cids) and [@src->base, + * @src->base + @src->nr_cids). Bits in @dst outside that window + * keep their prior values - in particular, cmask_copy() does NOT zero @dst + * bits that lie outside @src's range. 
+ */ +static __always_inline void cmask_and(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_AND); +} + +static __always_inline void cmask_or(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_OR); +} + +static __always_inline void cmask_copy(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_COPY); +} + +static __always_inline void cmask_andnot(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_ANDNOT); +} + +/* + * True iff @a and @b have identical bits over their (assumed equal) range. + * Callers are expected to pass same-shape cmasks; differing shapes always + * compare unequal. + */ +static __always_inline bool cmask_equal(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 nr_words, i; + + if (a->base != b->base || a->nr_cids != b->nr_cids) + return false; + nr_words = CMASK_NR_WORDS(a->nr_cids); + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + if (a->bits[i] != b->bits[i]) + return false; + } + return true; +} + +/* + * True iff @a's range lies within @b's and every bit set in @a is also set + * in @b. An @a range reaching outside @b's fails even if no bit is set there. 
+ */ +static __always_inline bool cmask_subset(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 a_wbase = a->base / 64; + u32 b_wbase = b->base / 64; + u32 nr_words, i; + + /* any bit of @a outside @b's range is a subset violation */ + if (a->base < b->base || a_end > b_end) + return false; + + nr_words = CMASK_NR_WORDS(a->nr_cids); + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 wi_b; + + if (i >= nr_words) + break; + wi_b = a_wbase + i - b_wbase; + if (a->bits[i] & ~b->bits[wi_b]) + return false; + } + return true; +} + +/** + * cmask_next_set - find the first set bit at or after @cid + * @m: cmask to search + * @cid: starting cid (clamped to @m->base if below) + * + * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or + * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the + * termination condition used by cmask_for_each()). + */ +static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid) +{ + u32 end = m->base + m->nr_cids; + u32 base = m->base / 64; + u32 last_wi = (end - 1) / 64 - base; + u32 start_wi, start_bit, i; + + if (cid < m->base) + cid = m->base; + if (cid >= end) + return end; + + start_wi = cid / 64 - base; + start_bit = cid & 63; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 wi = start_wi + i; + u64 word; + u32 found; + + if (wi > last_wi) + break; + + word = m->bits[wi]; + if (i == 0) + word &= GENMASK_U64(63, start_bit); + if (!word) + continue; + + found = (base + wi) * 64 + ctzll(word); + if (found >= end) + return end; + return found; + } + return end; +} + +static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m) +{ + return cmask_next_set(m, m->base); +} + +#define cmask_for_each(cid, m) \ + for ((cid) = cmask_first_set(m); \ + (cid) < (m)->base + (m)->nr_cids; \ + (cid) = cmask_next_set((m), (cid) + 1)) + +/* + * Population count over [base, base + nr_cids). 
Padding bits in the head/tail + * words are guaranteed zero by the mutating helpers, so a flat popcount over + * all words is correct. + */ +static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m) +{ + u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i; + u32 count = 0; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + count += __builtin_popcountll(m->bits[i]); + } + return count; +} + +/* + * True if @a and @b share any set bit. Walk only the intersection of their + * ranges, matching the semantics of cmask_and(). + */ +static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 lo = a->base > b->base ? a->base : b->base; + u32 hi = a_end < b_end ? a_end : b_end; + u32 a_base = a->base / 64; + u32 b_base = b->base / 64; + u32 lo_word, hi_word, i; + u64 head_mask, tail_mask; + + if (lo >= hi) + return false; + + lo_word = lo / 64; + hi_word = (hi - 1) / 64; + head_mask = GENMASK_U64(63, lo & 63); + tail_mask = GENMASK_U64((hi - 1) & 63, 0); + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 w = lo_word + i; + u64 mask, av, bv; + + if (w > hi_word) + break; + + mask = GENMASK_U64(63, 0); + if (w == lo_word) + mask &= head_mask; + if (w == hi_word) + mask &= tail_mask; + + av = a->bits[w - a_base] & mask; + bv = b->bits[w - b_base] & mask; + if (av & bv) + return true; + } + return false; +} + +/* + * Find the next cid set in both @a and @b at or after @start, bounded by the + * intersection of the two ranges. Return a->base + a->nr_cids if none found. + * + * Building block for cmask_next_and_set_wrap(). Callers that want a bounded + * scan without wrap call this directly. 
+ */ +static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b, + u32 start) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 a_wbase = a->base / 64; + u32 b_wbase = b->base / 64; + u32 lo = a->base > b->base ? a->base : b->base; + u32 hi = a_end < b_end ? a_end : b_end; + u32 last_wi, start_wi, start_bit, i; + + if (lo >= hi) + return a_end; + if (start < lo) + start = lo; + if (start >= hi) + return a_end; + + last_wi = (hi - 1) / 64; + start_wi = start / 64; + start_bit = start & 63; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 abs_wi = start_wi + i; + u64 word; + u32 found; + + if (abs_wi > last_wi) + break; + + word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase]; + if (i == 0) + word &= GENMASK_U64(63, start_bit); + if (!word) + continue; + + found = abs_wi * 64 + ctzll(word); + if (found >= hi) + return a_end; + return found; + } + return a_end; +} + +/* + * Find the next set cid in @m at or after @start, wrapping to @m->base if no + * set bit is found in [start, m->base + m->nr_cids). Return m->base + + * m->nr_cids if @m is empty. + * + * Callers do round-robin distribution by passing (last_cid + 1) as @start. + */ +static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m, + u32 start) +{ + u32 end = m->base + m->nr_cids; + u32 found; + + found = cmask_next_set(m, start); + if (found < end || start <= m->base) + return found; + + found = cmask_next_set(m, m->base); + return found < start ? found : end; +} + +/* + * Find the next cid set in both @a and @b at or after @start, wrapping to + * @a->base if none found in the forward half. Return a->base + a->nr_cids + * if the intersection is empty. + * + * Callers do round-robin distribution by passing (last_cid + 1) as @start. 
+ */ +static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b, + u32 start) +{ + u32 a_end = a->base + a->nr_cids; + u32 found; + + found = cmask_next_and_set(a, b, start); + if (found < a_end || start <= a->base) + return found; + + found = cmask_next_and_set(a, b, a->base); + return found < start ? found : a_end; +} + +/** + * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask + * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base + + * @m->nr_cids) are updated - cpus mapping to cids outside that range + * are ignored. + * @cpumask: kernel cpumask to translate + * + * For each cpu in @cpumask, set the cpu's cid in @m. Caller must ensure + * @cpumask stays stable across the call (e.g. RCU read lock for + * task->cpus_ptr). + */ +static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m, + const struct cpumask *cpumask) +{ + u32 nr_cpu_ids = scx_bpf_nr_cpu_ids(); + s32 cpu; + + cmask_zero(m); + bpf_for(cpu, 0, nr_cpu_ids) { + s32 cid; + + if (!bpf_cpumask_test_cpu(cpu, cpumask)) + continue; + cid = scx_bpf_cpu_to_cid(cpu); + if (cid >= 0) + __cmask_set(m, cid); + } +} + +#endif /* __SCX_CID_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 19459dedde41a..5f715d69cde6d 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -99,8 +99,21 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; +struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; +s32 scx_bpf_cpu_to_cid(s32 cpu) __ksym __weak; +s32 scx_bpf_cid_to_cpu(s32 cid) __ksym __weak; +void 
scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out) __ksym __weak; +void scx_bpf_kick_cid(s32 cid, u64 flags) __ksym __weak; +s32 scx_bpf_task_cid(const struct task_struct *p) __ksym __weak; +s32 scx_bpf_this_cid(void) __ksym __weak; +struct task_struct *scx_bpf_cid_curr(s32 cid) __ksym __weak; +u32 scx_bpf_nr_cids(void) __ksym __weak; +u32 scx_bpf_nr_online_cids(void) __ksym __weak; +u32 scx_bpf_cidperf_cap(s32 cid) __ksym __weak; +u32 scx_bpf_cidperf_cur(s32 cid) __ksym __weak; +void scx_bpf_cidperf_set(s32 cid, u32 perf) __ksym __weak; /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from @@ -526,6 +539,10 @@ static inline bool is_migration_disabled(const struct task_struct *p) void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; +/* resilient qspinlock */ +int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak; +void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak; + /* * Time helpers, most of which are from jiffies.h. */ @@ -1035,7 +1052,18 @@ static inline u64 scx_clock_irq(u32 cpu) return irqt ? BPF_CORE_READ(irqt, total) : 0; } +/* Abbreviated forms of <linux/overflow.h>'s struct_size() family. */ +#define flex_array_size(p, member, count) \ + ((count) * sizeof(*(p)->member)) + +#define struct_size(p, member, count) \ + (offsetof(typeof(*(p)), member) + flex_array_size(p, member, count)) + +#define struct_size_t(type, member, count) \ + struct_size((type *)NULL, member, count) + #include "compat.bpf.h" #include "enums.bpf.h" +#include "cid.bpf.h" #endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 8977b5a2caa10..87f15f2962348 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -121,6 +121,18 @@ static inline bool scx_bpf_sub_dispatch(u64 cgroup_id) return false; } +/* + * v7.2: scx_bpf_cid_override() for explicit cpu->cid mapping. 
Ignore if + * missing. + */ +void scx_bpf_cid_override___compat(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) __ksym __weak; + +static inline void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) +{ + if (bpf_ksym_exists(scx_bpf_cid_override___compat)) + return scx_bpf_cid_override___compat(cpu_to_cid, cpu_to_cid__sz); +} + /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on * in a compatible way. We will preserve this __COMPAT helper until v6.16. @@ -423,8 +435,10 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) } /* - * Define sched_ext_ops. This may be expanded to define multiple variants for - * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + * Define sched_ext_ops. See compat.h::SCX_OPS_OPEN() for how backward + * compatibility is handled (this macro can be expanded to emit multiple + * variants for incompatible op changes; SCX_OPS_OPEN() handles purely + * additive changes at load time). */ #define SCX_OPS_DEFINE(__name, ...) \ SEC(".struct_ops.link") \ @@ -432,4 +446,16 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) __VA_ARGS__, \ }; +/* + * Define a cid-form sched_ext_ops. Programs targeting this struct_ops type + * use cid-form callback signatures (select_cid, set_cmask, cid_online/offline, + * dispatch with cid arg, etc.) and may only call the cid-form scx_bpf_* + * kfuncs (kick_cid, task_cid, this_cid, ...). + */ +#define SCX_OPS_CID_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops_cid __name = { \ + __VA_ARGS__, \ + }; + #endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 039854c490d5e..602f07061ee39 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -149,10 +149,24 @@ static inline long scx_hotplug_seq(void) } /* - * struct sched_ext_ops can change over time. 
If compat.bpf.h::SCX_OPS_DEFINE() - * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load - * and attach it, backward compatibility is automatically maintained where - * reasonable. + * Open the sched_ext_ops skeleton. + * + * struct sched_ext_ops can change over time. Two complementary mechanisms + * keep BPF schedulers built against newer headers running on older kernels: + * + * 1. Load-time fix-up (this macro). For each optional ops callback or field + * added to struct sched_ext_ops, an explicit stanza below probes the + * running kernel's BTF via __COMPAT_struct_has_field() and, if the field + * is missing, clears it in the in-memory struct_ops (with a warning to + * stderr) before load. Handles additive changes - a new stanza must be + * added here for each new optional field. + * + * 2. Multi-variant struct_ops via compat.bpf.h::SCX_OPS_DEFINE(). That + * macro can be expanded to emit several variants of struct sched_ext_ops, + * and SCX_OPS_LOAD()/ATTACH() can pick the right one based on what the + * kernel supports. Needed when an existing operation has to change + * incompatibly (e.g. a callback signature changes); the load-time + * fix-up above only handles purely additive changes. * * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * the current minimum required kernel version. @@ -225,6 +239,7 @@ static inline void __scx_ops_assoc_prog(struct bpf_program *prog, } #endif +/* See SCX_OPS_OPEN() above for backward-compatibility handling. 
*/ #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ struct bpf_program *__prog; \ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h index e7ac6611a9901..98cab643c8d9a 100644 --- a/tools/sched_ext/include/scx/user_exit_info.bpf.h +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -32,6 +32,9 @@ __uei_name##_dump_len, (__ei)->dump); \ if (bpf_core_field_exists((__ei)->exit_code)) \ __uei_name.exit_code = (__ei)->exit_code; \ + __uei_name.exit_cpu = -1; \ + if (bpf_core_field_exists((__ei)->exit_cpu)) \ + __uei_name.exit_cpu = (__ei)->exit_cpu; \ /* use __sync to force memory barrier */ \ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ (__ei)->kind); \ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 399697fa372fb..56a02b549aef9 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -39,6 +39,8 @@ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ + if (__uei->exit_cpu >= 0) \ + fprintf(stderr, " on CPU %d", __uei->exit_cpu); \ fputs("\n", stderr); \ __uei->exit_code; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h index 2d0981aedd898..76e2a055eb4b0 100644 --- a/tools/sched_ext/include/scx/user_exit_info_common.h +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -22,6 +22,11 @@ enum uei_sizes { struct user_exit_info { int kind; + /* + * CPU that triggered the exit, or -1 if unset (e.g. running on an + * older kernel that does not expose this field). 
+ */ + s32 exit_cpu; s64 exit_code; char reason[UEI_REASON_LEN]; char msg[UEI_MSG_LEN]; diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 4efcce099bd52..64dd60b3e9223 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -149,10 +149,14 @@ static bool dispatch_to_cpu(s32 cpu) } /* - * If we can't run the task at the top, do the dumb thing and - * bounce it to the fallback dsq. + * If we can't run the task at the top for whatever reason, + * bounce it to the fallback dsq. Also check + * is_migration_disabled() explicitly as p->cpus_ptr may not + * reflect the migration-disabled state yet if + * migrate_disable_switch() hasn't run. */ - if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr) || + (is_migration_disabled(p) && scx_bpf_task_cpu(p) != cpu)) { __sync_fetch_and_add(&nr_mismatches, 1); scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c index 0b1a7ce879b06..909d1be1bfe3c 100644 --- a/tools/sched_ext/scx_cpu0.bpf.c +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -18,8 +18,6 @@ char _license[] SEC("license") = "GPL"; -const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ - UEI_DEFINE(uei); /* diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c index a6fba9978b9ce..4966e3d4c7249 100644 --- a/tools/sched_ext/scx_cpu0.c +++ b/tools/sched_ext/scx_cpu0.c @@ -72,8 +72,6 @@ restart: optind = 1; skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0); - skel->rodata->nr_cpus = libbpf_num_possible_cpus(); - while ((opt = getopt(argc, argv, "vh")) != -1) { switch (opt) { case 'v': diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index d865c381589bb..de2bef86d64d6 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -130,7 +130,6 @@ int main(int argc, char **argv) struct scx_flatcg *skel; struct 
bpf_link *link; struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; - bool dump_cgrps = false; __u64 last_cpu_sum = 0, last_cpu_idle = 0; __u64 last_stats[FCG_NR_STATS] = {}; unsigned long seq = 0; @@ -148,7 +147,7 @@ restart: assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { + while ((opt = getopt(argc, argv, "s:i:fvh")) != -1) { double v; switch (opt) { @@ -161,9 +160,6 @@ restart: intv_ts.tv_sec = v; intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; break; - case 'd': - dump_cgrps = true; - break; case 'f': skel->rodata->fifo_sched = true; break; @@ -177,10 +173,10 @@ restart: } } - printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + printf("slice=%.1lfms intv=%.1lfs", (double)skel->rodata->cgrp_slice_ns / 1000000.0, - (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, - dump_cgrps); + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0); + SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index aad698fe294bf..8a2d6a8ebd8ed 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -2,15 +2,16 @@ /* * A simple five-level FIFO queue scheduler. * - * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets - * assigned to one depending on its compound weight. Each CPU round robins - * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from - * queue0, 2 from queue1, 4 from queue2 and so on. + * There are five FIFOs implemented as arena-backed doubly-linked lists + * threaded through per-task context. A task gets assigned to one depending on + * its compound weight. 
Each CPU round robins through the FIFOs and dispatches + * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from + * queue2 and so on. * * This scheduler demonstrates: * - * - BPF-side queueing using PIDs. - * - Sleepable per-task storage allocation using ops.prep_enable(). + * - BPF-side queueing using TIDs. + * - BPF arena for scheduler state. * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext @@ -22,6 +23,8 @@ */ #include <scx/common.bpf.h> +#include "scx_qmap.h" + enum consts { ONE_SEC_IN_NS = 1000000000, ONE_MSEC_IN_NS = 1000000, @@ -47,40 +50,72 @@ const volatile s32 disallow_tgid; const volatile bool suppress_dump; const volatile bool always_enq_immed; const volatile u32 immed_stress_nth; +const volatile u32 max_tasks; -u64 nr_highpri_queued; -u32 test_error_cnt; - -#define MAX_SUB_SCHEDS 8 -u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; +/* + * Optional cid-override test harness. When cid_override_mode is non-zero, + * qmap_init() calls scx_bpf_cid_override() with the caller-supplied + * cpu_to_cid array to exercise the kfunc's acceptance and error paths. + * + * 0 = disabled + * 1 = valid reverse mapping + * 2 = invalid: duplicate cid assignment + * 3 = invalid: out-of-range cid + */ +const volatile u32 cid_override_mode; +/* + * Array lives in bss (writable) because scx_bpf_cid_override()'s BPF + * verifier signature treats its len-paired pointer as read/write - rodata + * fails verification with "write into map forbidden". Userspace populates + * it before SCX_OPS_LOAD, same as rodata, and nothing writes it after. 
+ */ +s32 cid_override_cpu_to_cid[SCX_QMAP_MAX_CPUS]; UEI_DEFINE(uei); -struct qmap { - __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, 4096); - __type(value, u32); -} queue0 SEC(".maps"), - queue1 SEC(".maps"), - queue2 SEC(".maps"), - queue3 SEC(".maps"), - queue4 SEC(".maps"), - dump_store SEC(".maps"); - +/* + * All scheduler state - per-cpu context, stats counters, core-sched sequence + * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace + * reaches it via skel->arena->qa. + */ struct { - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); - __uint(max_entries, 5); - __type(key, int); - __array(values, struct qmap); -} queue_arr SEC(".maps") = { - .values = { - [0] = &queue0, - [1] = &queue1, - [2] = &queue2, - [3] = &queue3, - [4] = &queue4, - }, -}; + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1 << 16); /* upper bound in pages */ +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */ +#else + __ulong(map_extra, 0x1ull << 44); +#endif +} arena SEC(".maps"); + +struct qmap_arena __arena_global qa; + +/* + * Global idle-cid tracking, maintained via update_idle / cpu_offline and + * scanned by the direct-dispatch path. Allocated in qmap_init() from one + * arena page, sized to the full cid space. + */ +struct scx_cmask __arena *qa_idle_cids; + +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. 
*/ +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0"); +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1"); +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2"); +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3"); +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4"); + +static struct bpf_res_spin_lock *qa_q_lock(s32 qid) +{ + switch (qid) { + case 0: return &qa_q_lock0; + case 1: return &qa_q_lock1; + case 2: return &qa_q_lock2; + case 3: return &qa_q_lock3; + case 4: return &qa_q_lock4; + default: return NULL; + } +} /* * If enabled, CPU performance target is set according to the queue index @@ -102,85 +137,214 @@ static const u32 qidx_to_cpuperf_target[] = { * task's seq and the associated queue's head seq is called the queue distance * and used when comparing two tasks for ordering. See qmap_core_sched_before(). */ -static u64 core_sched_head_seqs[5]; -static u64 core_sched_tail_seqs[5]; -/* Per-task scheduling context */ +/* + * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in + * arena. While the task is alive the entry is referenced from task_ctx_stor; + * while it's free the entry sits on the free list singly-linked through + * @next_free. + * + * When the task is queued on one of the five priority FIFOs, @fifo points to + * that queue and @q_next/@q_prev link it in the queue's doubly-linked list. + * @fifo is NULL when the task isn't on any queue. 
+ */ struct task_ctx { - bool force_local; /* Dispatch directly to local_dsq */ - bool highpri; - u64 core_sched_seq; + struct task_ctx __arena *next_free; /* only valid on free list */ + struct task_ctx __arena *q_next; /* queue link, NULL if tail */ + struct task_ctx __arena *q_prev; /* queue link, NULL if head */ + struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */ + u64 tid; + s32 pid; /* for dump only */ + bool force_local; /* Dispatch directly to local_dsq */ + bool highpri; + u64 core_sched_seq; + struct scx_cmask cpus_allowed; /* per-task affinity in cid space */ +}; + +/* + * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the + * tail bytes appended per entry; struct_size() gives the actual per-entry + * footprint. + */ +#define TASK_CTX_STRIDE \ + struct_size_t(struct task_ctx, cpus_allowed.bits, \ + CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS)) + +/* All task_ctx pointers are arena pointers. */ +typedef struct task_ctx __arena task_ctx_t; + +/* Holds an arena pointer to the task's slab entry. */ +struct task_ctx_stor_val { + task_ctx_t *taskc; }; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); - __type(value, struct task_ctx); + __type(value, struct task_ctx_stor_val); } task_ctx_stor SEC(".maps"); -struct cpu_ctx { - u64 dsp_idx; /* dispatch index */ - u64 dsp_cnt; /* remaining count */ - u32 avg_weight; - u32 cpuperf_target; -}; +/* Protects the task_ctx slab free list. 
*/ +__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock"); -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 1); - __type(key, u32); - __type(value, struct cpu_ctx); -} cpu_ctx_stor SEC(".maps"); +static int qmap_spin_lock(struct bpf_res_spin_lock *lock) +{ + if (bpf_res_spin_lock(lock)) { + scx_bpf_error("res_spin_lock failed"); + return -EBUSY; + } + return 0; +} -/* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq; -u64 nr_core_sched_execed; -u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; -u32 cpuperf_min, cpuperf_avg, cpuperf_max; -u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; +/* + * Try prev_cid, then scan taskc->cpus_allowed AND qa_idle_cids round-robin + * from prev_cid + 1. Atomic claim retries on race; bounded by + * IDLE_PICK_RETRIES to keep the verifier's insn budget in check. + */ +#define IDLE_PICK_RETRIES 16 -static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) +static s32 pick_direct_dispatch_cid(struct task_struct *p, s32 prev_cid, + task_ctx_t *taskc) { - s32 cpu; + u32 nr_cids = scx_bpf_nr_cids(); + s32 cid; + u32 i; if (!always_enq_immed && p->nr_cpus_allowed == 1) - return prev_cpu; - - if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; + return prev_cid; - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - return cpu; + if (cmask_test_and_clear(qa_idle_cids, prev_cid)) + return prev_cid; + cid = prev_cid; + bpf_for(i, 0, IDLE_PICK_RETRIES) { + cid = cmask_next_and_set_wrap(&taskc->cpus_allowed, + qa_idle_cids, cid + 1); + barrier_var(cid); + if (cid >= nr_cids) + return -1; + if (cmask_test_and_clear(qa_idle_cids, cid)) + return cid; + } return -1; } -static struct task_ctx *lookup_task_ctx(struct task_struct *p) +/* + * Force a reference to the arena map. 
The verifier associates an arena with + * a program by finding an LD_IMM64 instruction that loads the arena's BPF + * map; programs that only use arena pointers returned from task-local + * storage (like qmap_select_cpu) never reference @arena directly. Without + * this, the verifier rejects addr_space_cast with "addr_space_cast insn + * can only be used in a program that has an associated arena". + */ +#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0) + +static task_ctx_t *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx_stor_val *v; + + QMAP_TOUCH_ARENA(); + + v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!v || !v->taskc) + return NULL; + return v->taskc; +} + +/* Append @taskc to the tail of @fifo. Must not already be queued. */ +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc) +{ + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx); + + if (!lock || qmap_spin_lock(lock)) + return; + taskc->fifo = fifo; + taskc->q_next = NULL; + taskc->q_prev = fifo->tail; + if (fifo->tail) + fifo->tail->q_next = taskc; + else + fifo->head = taskc; + fifo->tail = taskc; + bpf_res_spin_unlock(lock); +} + +/* Pop the head of @fifo. Returns NULL if empty. */ +static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo) +{ + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx); + task_ctx_t *taskc; + + if (!lock || qmap_spin_lock(lock)) + return NULL; + taskc = fifo->head; + if (taskc) { + fifo->head = taskc->q_next; + if (taskc->q_next) + taskc->q_next->q_prev = NULL; + else + fifo->tail = NULL; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + } + bpf_res_spin_unlock(lock); + return taskc; +} + +/* Remove @taskc from its fifo. No-op if not queued. 
*/ +static void qmap_fifo_remove(task_ctx_t *taskc) { - return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + struct qmap_fifo __arena *fifo = taskc->fifo; + struct bpf_res_spin_lock *lock; + + if (!fifo) + return; + + lock = qa_q_lock(fifo->idx); + if (!lock || qmap_spin_lock(lock)) + return; + + /* Re-check under lock — a concurrent pop may have cleared fifo. */ + if (taskc->fifo != fifo) { + bpf_res_spin_unlock(lock); + return; + } + + if (taskc->q_next) + taskc->q_next->q_prev = taskc->q_prev; + else + fifo->tail = taskc->q_prev; + if (taskc->q_prev) + taskc->q_prev->q_next = taskc->q_next; + else + fifo->head = taskc->q_next; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + bpf_res_spin_unlock(lock); } -s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, - s32 prev_cpu, u64 wake_flags) +s32 BPF_STRUCT_OPS(qmap_select_cid, struct task_struct *p, + s32 prev_cid, u64 wake_flags) { - struct task_ctx *tctx; - s32 cpu; + task_ctx_t *taskc; + s32 cid; - if (!(tctx = lookup_task_ctx(p))) - return prev_cpu; + if (!(taskc = lookup_task_ctx(p))) + return prev_cid; if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) - return prev_cpu; + return prev_cid; - cpu = pick_direct_dispatch_cpu(p, prev_cpu); + cid = pick_direct_dispatch_cid(p, prev_cid, taskc); - if (cpu >= 0) { - tctx->force_local = true; - return cpu; + if (cid >= 0) { + taskc->force_local = true; + return cid; } else { - return prev_cpu; + return prev_cid; } } @@ -202,16 +366,14 @@ static int weight_to_idx(u32 weight) void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) { static u32 user_cnt, kernel_cnt; - struct task_ctx *tctx; - u32 pid = p->pid; + task_ctx_t *taskc; int idx = weight_to_idx(p->scx.weight); - void *ring; - s32 cpu; + s32 cid; if (enq_flags & SCX_ENQ_REENQ) { - __sync_fetch_and_add(&nr_reenqueued, 1); - if (scx_bpf_task_cpu(p) == 0) - __sync_fetch_and_add(&nr_reenqueued_cpu0, 1); + __sync_fetch_and_add(&qa.nr_reenqueued, 1); + if 
(scx_bpf_task_cid(p) == 0) + __sync_fetch_and_add(&qa.nr_reenqueued_cid0, 1); } if (p->flags & PF_KTHREAD) { @@ -222,17 +384,17 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - if (test_error_cnt && !--test_error_cnt) + if (qa.test_error_cnt && !--qa.test_error_cnt) scx_bpf_error("test triggering error"); - if (!(tctx = lookup_task_ctx(p))) + if (!(taskc = lookup_task_ctx(p))) return; /* * All enqueued tasks must have their core_sched_seq updated for correct * core-sched ordering. Also, take a look at the end of qmap_dispatch(). */ - tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++; /* * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch @@ -243,19 +405,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) static u32 immed_stress_cnt; if (!(++immed_stress_cnt % immed_stress_nth)) { - tctx->force_local = false; - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p), + taskc->force_local = false; + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cid(p), slice_ns, enq_flags); return; } } /* - * If qmap_select_cpu() is telling us to or this is the last runnable + * If qmap_select_cid() is telling us to or this is the last runnable * task on the CPU, enqueue locally. 
*/ - if (tctx->force_local) { - tctx->force_local = false; + if (taskc->force_local) { + taskc->force_local = false; scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } @@ -267,11 +429,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* if select_cpu() wasn't called, try direct dispatch */ + /* if select_cid() wasn't called, try direct dispatch */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && - (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { - __sync_fetch_and_add(&nr_ddsp_from_enq, 1); - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + (cid = pick_direct_dispatch_cid(p, scx_bpf_task_cid(p), taskc)) >= 0) { + __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cid, slice_ns, enq_flags); return; } @@ -279,55 +441,52 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) * If the task was re-enqueued due to the CPU being preempted by a * higher priority scheduling class, just re-enqueue the task directly * on the global DSQ. As we want another CPU to pick it up, find and - * kick an idle CPU. + * kick an idle cid. */ if (enq_flags & SCX_ENQ_REENQ) { - s32 cpu; + s32 cid; scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags); - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + cid = cmask_next_and_set_wrap(&taskc->cpus_allowed, + qa_idle_cids, 0); + if (cid < scx_bpf_nr_cids()) + scx_bpf_kick_cid(cid, SCX_KICK_IDLE); return; } - ring = bpf_map_lookup_elem(&queue_arr, &idx); - if (!ring) { - scx_bpf_error("failed to find ring %d", idx); - return; - } - - /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ - if (bpf_map_push_elem(ring, &pid, 0)) { - scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags); - return; - } + /* Queue on the selected FIFO. 
*/ + qmap_fifo_enqueue(&qa.fifos[idx], taskc); if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { - tctx->highpri = true; - __sync_fetch_and_add(&nr_highpri_queued, 1); + taskc->highpri = true; + __sync_fetch_and_add(&qa.nr_highpri_queued, 1); } - __sync_fetch_and_add(&nr_enqueued, 1); + __sync_fetch_and_add(&qa.nr_enqueued, 1); } -/* - * The BPF queue map doesn't support removal and sched_ext can handle spurious - * dispatches. qmap_dequeue() is only used to collect statistics. - */ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { - __sync_fetch_and_add(&nr_dequeued, 1); + task_ctx_t *taskc; + + __sync_fetch_and_add(&qa.nr_dequeued, 1); if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) - __sync_fetch_and_add(&nr_core_sched_execed, 1); + __sync_fetch_and_add(&qa.nr_core_sched_execed, 1); + + taskc = lookup_task_ctx(p); + if (taskc && taskc->fifo) { + if (taskc->highpri) + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1); + qmap_fifo_remove(taskc); + } } static void update_core_sched_head_seq(struct task_struct *p) { int idx = weight_to_idx(p->scx.weight); - struct task_ctx *tctx; + task_ctx_t *taskc; - if ((tctx = lookup_task_ctx(p))) - core_sched_head_seqs[idx] = tctx->core_sched_seq; + if ((taskc = lookup_task_ctx(p))) + qa.core_sched_head_seqs[idx] = taskc->core_sched_seq; } /* @@ -343,17 +502,18 @@ static void update_core_sched_head_seq(struct task_struct *p) static bool dispatch_highpri(bool from_timer) { struct task_struct *p; - s32 this_cpu = bpf_get_smp_processor_id(); + s32 this_cid = scx_bpf_this_cid(); + u32 nr_cids = scx_bpf_nr_cids(); /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { static u64 highpri_seq; - struct task_ctx *tctx; + task_ctx_t *taskc; - if (!(tctx = lookup_task_ctx(p))) + if (!(taskc = lookup_task_ctx(p))) return false; - if (tctx->highpri) { + if (taskc->highpri) { /* exercise the set_*() and vtime interface too */ scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, 
slice_ns * 2); scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); @@ -362,30 +522,38 @@ static bool dispatch_highpri(bool from_timer) } /* - * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU - * is found. + * Scan HIGHPRI_DSQ and dispatch until a task that can run here is + * found. Prefer this_cid if the task allows it; otherwise RR-scan the + * task's cpus_allowed starting after this_cid. */ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { + task_ctx_t *taskc; bool dispatched = false; - s32 cpu; + s32 cid; + + if (!(taskc = lookup_task_ctx(p))) + return false; - if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) - cpu = this_cpu; + if (cmask_test(&taskc->cpus_allowed, this_cid)) + cid = this_cid; else - cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); + cid = cmask_next_set_wrap(&taskc->cpus_allowed, + this_cid + 1); + if (cid >= nr_cids) + continue; - if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, + if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cid, SCX_ENQ_PREEMPT)) { - if (cpu == this_cpu) { + if (cid == this_cid) { dispatched = true; - __sync_fetch_and_add(&nr_expedited_local, 1); + __sync_fetch_and_add(&qa.nr_expedited_local, 1); } else { - __sync_fetch_and_add(&nr_expedited_remote, 1); + __sync_fetch_and_add(&qa.nr_expedited_remote, 1); } if (from_timer) - __sync_fetch_and_add(&nr_expedited_from_timer, 1); + __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1); } else { - __sync_fetch_and_add(&nr_expedited_lost, 1); + __sync_fetch_and_add(&qa.nr_expedited_lost, 1); } if (dispatched) @@ -395,22 +563,21 @@ static bool dispatch_highpri(bool from_timer) return false; } -void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +void BPF_STRUCT_OPS(qmap_dispatch, s32 cid, struct task_struct *prev) { struct task_struct *p; - struct cpu_ctx *cpuc; - struct task_ctx *tctx; - u32 zero = 0, batch = dsp_batch ?: 1; - void *fifo; - s32 i, pid; + struct cpu_ctx __arena *cpuc; + task_ctx_t *taskc; + u32 
batch = dsp_batch ?: 1; + s32 i; if (dispatch_highpri(false)) return; - if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0)) + if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0)) return; - if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) { /* * PID 2 should be kthreadd which should mostly be idle and off * the scheduler. Let's keep dispatching it to force the kernel @@ -424,10 +591,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { - scx_bpf_error("failed to look up cpu_ctx"); - return; - } + cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()]; for (i = 0; i < 5; i++) { /* Advance the dispatch cursor and pick the fifo. */ @@ -436,33 +600,23 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) cpuc->dsp_cnt = 1 << cpuc->dsp_idx; } - fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); - if (!fifo) { - scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); - return; - } - /* Dispatch or advance. */ bpf_repeat(BPF_MAX_LOOPS) { - struct task_ctx *tctx; + task_ctx_t *taskc; - if (bpf_map_pop_elem(fifo, &pid)) + taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]); + if (!taskc) break; - p = bpf_task_from_pid(pid); + p = scx_bpf_tid_to_task(taskc->tid); if (!p) continue; - if (!(tctx = lookup_task_ctx(p))) { - bpf_task_release(p); - return; - } - - if (tctx->highpri) - __sync_fetch_and_sub(&nr_highpri_queued, 1); + if (taskc->highpri) + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1); update_core_sched_head_seq(p); - __sync_fetch_and_add(&nr_dispatched, 1); + __sync_fetch_and_add(&qa.nr_dispatched, 1); scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0); @@ -502,10 +656,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) * document this class of issue -- other schedulers * seeing similar warnings can use this as a reference. 
*/ - if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) - scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); - - bpf_task_release(p); + if (!cmask_test(&taskc->cpus_allowed, cid)) + scx_bpf_kick_cid(scx_bpf_task_cid(p), 0); batch--; cpuc->dsp_cnt--; @@ -523,8 +675,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (sub_sched_cgroup_ids[i] && - scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i])) + if (qa.sub_sched_cgroup_ids[i] && + scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i])) return; } @@ -533,24 +685,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) * if the task were enqueued and dispatched immediately. */ if (prev) { - tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0); - if (tctx) - tctx->core_sched_seq = - core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; + taskc = lookup_task_ctx(prev); + if (!taskc) + return; + + taskc->core_sched_seq = + qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; } } void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) { - struct cpu_ctx *cpuc; - u32 zero = 0; + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()]; int idx; - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { - scx_bpf_error("failed to look up cpu_ctx"); - return; - } - /* * Use the running avg of weights to select the target cpuperf level. 
* This is a demonstration of the cpuperf feature rather than a @@ -560,7 +708,7 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) idx = weight_to_idx(cpuc->avg_weight); cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; - scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); + scx_bpf_cidperf_set(scx_bpf_task_cid(p), cpuc->cpuperf_target); } /* @@ -570,14 +718,14 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) static s64 task_qdist(struct task_struct *p) { int idx = weight_to_idx(p->scx.weight); - struct task_ctx *tctx; + task_ctx_t *taskc; s64 qdist; - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) + taskc = lookup_task_ctx(p); + if (!taskc) return 0; - qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx]; /* * As queue index increments, the priority doubles. The queue w/ index 3 @@ -610,70 +758,110 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before, * tasks when a higher-priority scheduling class takes the CPU. */ -s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, - struct scx_init_task_args *args) +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) { + struct task_ctx_stor_val *v; + task_ctx_t *taskc; + if (p->tgid == disallow_tgid) p->scx.disallow = true; - /* - * @p is new. Let's ensure that its task_ctx is available. We can sleep - * in this function and the following will automatically use GFP_KERNEL. 
- */ - if (bpf_task_storage_get(&task_ctx_stor, p, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE)) - return 0; - else + /* pop a slab entry off the free list */ + if (qmap_spin_lock(&qa_task_lock)) + return -EBUSY; + taskc = qa.task_free_head; + if (taskc) + qa.task_free_head = taskc->next_free; + bpf_res_spin_unlock(&qa_task_lock); + if (!taskc) { + scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks); + return -ENOMEM; + } + + taskc->next_free = NULL; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + taskc->tid = p->scx.tid; + taskc->pid = p->pid; + taskc->force_local = false; + taskc->highpri = false; + taskc->core_sched_seq = 0; + cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids()); + bpf_rcu_read_lock(); + cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr); + bpf_rcu_read_unlock(); + + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!v) { + /* push back to the free list */ + if (!qmap_spin_lock(&qa_task_lock)) { + taskc->next_free = qa.task_free_head; + qa.task_free_head = taskc; + bpf_res_spin_unlock(&qa_task_lock); + } return -ENOMEM; + } + v->taskc = taskc; + return 0; } -void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { - s32 i, pid; + struct task_ctx_stor_val *v; + task_ctx_t *taskc; - if (suppress_dump) + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0); + if (!v || !v->taskc) return; + taskc = v->taskc; + v->taskc = NULL; - bpf_for(i, 0, 5) { - void *fifo; + if (qmap_spin_lock(&qa_task_lock)) + return; + taskc->next_free = qa.task_free_head; + qa.task_free_head = taskc; + bpf_res_spin_unlock(&qa_task_lock); +} - if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) - return; +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + task_ctx_t *taskc; + s32 i; - scx_bpf_dump("QMAP FIFO[%d]:", i); + QMAP_TOUCH_ARENA(); - /* - * Dump can be invoked anytime and there is no 
way to iterate in - * a non-destructive way. Pop and store in dump_store and then - * restore afterwards. If racing against new enqueues, ordering - * can get mixed up. - */ - bpf_repeat(4096) { - if (bpf_map_pop_elem(fifo, &pid)) - break; - bpf_map_push_elem(&dump_store, &pid, 0); - scx_bpf_dump(" %d", pid); - } + if (suppress_dump) + return; + /* + * Walk the queue lists without locking - kfunc calls (scx_bpf_dump) + * aren't in the verifier's kfunc_spin_allowed() list so we can't hold + * a lock and dump. Best-effort; racing may print stale tids but the + * walk is bounded by bpf_repeat() so it always terminates. + */ + bpf_for(i, 0, 5) { + scx_bpf_dump("QMAP FIFO[%d]:", i); + taskc = qa.fifos[i].head; bpf_repeat(4096) { - if (bpf_map_pop_elem(&dump_store, &pid)) + if (!taskc) break; - bpf_map_push_elem(fifo, &pid, 0); + scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid); + taskc = taskc->q_next; } - scx_bpf_dump("\n"); } } -void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +void BPF_STRUCT_OPS(qmap_dump_cid, struct scx_dump_ctx *dctx, s32 cid, bool idle) { - u32 zero = 0; - struct cpu_ctx *cpuc; + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid]; if (suppress_dump || idle) return; - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) - return; scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, @@ -682,12 +870,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) { - struct task_ctx *taskc; + struct task_ctx_stor_val *v; + task_ctx_t *taskc; + + QMAP_TOUCH_ARENA(); if (suppress_dump) return; - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0); + if (!v || !v->taskc) return; + taskc = v->taskc; scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", 
taskc->force_local, taskc->core_sched_seq); @@ -716,61 +909,25 @@ void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, cgrp->kn->id, period_us, quota_us, burst_us); } -/* - * Print out the online and possible CPU map using bpf_printk() as a - * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). - */ -static void print_cpus(void) +void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle) { - const struct cpumask *possible, *online; - s32 cpu; - char buf[128] = "", *p; - int idx; - - possible = scx_bpf_get_possible_cpumask(); - online = scx_bpf_get_online_cpumask(); - - idx = 0; - bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - if (bpf_cpumask_test_cpu(cpu, online)) - *p++ = 'O'; - else if (bpf_cpumask_test_cpu(cpu, possible)) - *p++ = 'X'; - else - *p++ = ' '; - - if ((cpu & 7) == 7) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - *p++ = '|'; - } - } - buf[sizeof(buf) - 1] = '\0'; - - scx_bpf_put_cpumask(online); - scx_bpf_put_cpumask(possible); - - bpf_printk("CPUS: |%s", buf); + QMAP_TOUCH_ARENA(); + if (idle) + cmask_set(qa_idle_cids, cid); + else + cmask_clear(qa_idle_cids, cid); } -void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p, + const struct scx_cmask *cmask_in) { - if (print_msgs) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); - } -} + struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in; + task_ctx_t *taskc; -void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) -{ - if (print_msgs) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); - } + taskc = lookup_task_ctx(p); + if (!taskc) + return; + cmask_copy(&taskc->cpus_allowed, cmask); } struct monitor_timer { @@ -785,64 +942,49 @@ struct { } monitor_timer SEC(".maps"); /* - * Print out the min, avg and max performance levels of CPUs every second to 
- * demonstrate the cpuperf interface. + * Aggregate cidperf across the first nr_online_cids cids. Post-hotplug + * the first-N-are-online invariant drifts, so some cap/cur values may + * be stale. For this demo monitor that's fine; the scheduler exits on + * the enable-time hotplug_seq mismatch and userspace restarts, which + * rebuilds the layout. */ static void monitor_cpuperf(void) { - u32 zero = 0, nr_cpu_ids; + u32 nr_online = scx_bpf_nr_online_cids(); u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; - const struct cpumask *online; - int i, nr_online_cpus = 0; - - nr_cpu_ids = scx_bpf_nr_cpu_ids(); - online = scx_bpf_get_online_cpumask(); - - bpf_for(i, 0, nr_cpu_ids) { - struct cpu_ctx *cpuc; - u32 cap, cur; + s32 cid; - if (!bpf_cpumask_test_cpu(i, online)) - continue; - nr_online_cpus++; + QMAP_TOUCH_ARENA(); - /* collect the capacity and current cpuperf */ - cap = scx_bpf_cpuperf_cap(i); - cur = scx_bpf_cpuperf_cur(i); + bpf_for(cid, 0, nr_online) { + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid]; + u32 cap = scx_bpf_cidperf_cap(cid); + u32 cur = scx_bpf_cidperf_cur(cid); + u32 target; cur_min = cur < cur_min ? cur : cur_min; cur_max = cur > cur_max ? cur : cur_max; - /* - * $cur is relative to $cap. Scale it down accordingly so that - * it's in the same scale as other CPUs and $cur_sum/$cap_sum - * makes sense. - */ - cur_sum += cur * cap / SCX_CPUPERF_ONE; + cur_sum += (u64)cur * cap / SCX_CPUPERF_ONE; cap_sum += cap; - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { - scx_bpf_error("failed to look up cpu_ctx"); - goto out; - } - - /* collect target */ - cur = cpuc->cpuperf_target; - target_sum += cur; - target_min = cur < target_min ? cur : target_min; - target_max = cur > target_max ? cur : target_max; + target = cpuc->cpuperf_target; + target_sum += target; + target_min = target < target_min ? 
target : target_min; + target_max = target > target_max ? target : target_max; } - cpuperf_min = cur_min; - cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; - cpuperf_max = cur_max; + if (!nr_online || !cap_sum) + return; + + qa.cpuperf_min = cur_min; + qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + qa.cpuperf_max = cur_max; - cpuperf_target_min = target_min; - cpuperf_target_avg = target_sum / nr_online_cpus; - cpuperf_target_max = target_max; -out: - scx_bpf_put_cpumask(online); + qa.cpuperf_target_min = target_min; + qa.cpuperf_target_avg = target_sum / nr_online; + qa.cpuperf_target_max = target_max; } /* @@ -927,12 +1069,76 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { - u32 key = 0; + u8 __arena *slab; + u32 nr_pages, key = 0, i; + u32 nr_cids, nr_cpu_ids; struct bpf_timer *timer; s32 ret; - if (print_msgs && !sub_cgroup_id) - print_cpus(); + nr_cids = scx_bpf_nr_cids(); + nr_cpu_ids = scx_bpf_nr_cpu_ids(); + + if (nr_cids > SCX_QMAP_MAX_CPUS) { + scx_bpf_error("nr_cids=%u exceeds SCX_QMAP_MAX_CPUS=%d", + nr_cids, SCX_QMAP_MAX_CPUS); + return -EINVAL; + } + if (nr_cpu_ids > SCX_QMAP_MAX_CPUS) { + scx_bpf_error("nr_cpu_ids=%u exceeds SCX_QMAP_MAX_CPUS=%d", + nr_cpu_ids, SCX_QMAP_MAX_CPUS); + return -EINVAL; + } + + /* + * cid-override test hook. Must run before anything that reads the + * cid space (scx_bpf_nr_cids, cmask_init, etc.). On invalid input, + * the kfunc calls scx_error() which aborts the scheduler. + */ + if (cid_override_mode) { + scx_bpf_cid_override((const s32 *)cid_override_cpu_to_cid, + nr_cpu_ids * sizeof(s32)); + } + + /* + * Allocate the task_ctx slab in arena and thread the entire slab onto + * the free list. max_tasks is set by userspace before load. Each entry + * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex + * array extends into the stride tail. 
+ */ + if (!max_tasks) { + scx_bpf_error("max_tasks must be > 0"); + return -EINVAL; + } + + nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE; + slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0); + if (!slab) { + scx_bpf_error("failed to allocate task_ctx slab"); + return -ENOMEM; + } + qa.task_ctxs = (task_ctx_t *)slab; + + bpf_for(i, 0, 5) + qa.fifos[i].idx = i; + + bpf_for(i, 0, max_tasks) { + task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE); + task_ctx_t *next = (i + 1 < max_tasks) ? + (task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL; + cur->next_free = next; + } + qa.task_free_head = (task_ctx_t *)slab; + + /* + * Allocate and initialize the idle cmask. Starts empty - update_idle + * fills it as cpus enter idle. + */ + qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!qa_idle_cids) { + scx_bpf_error("failed to allocate idle cmask"); + return -ENOMEM; + } + cmask_init(qa_idle_cids, 0, nr_cids); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) { @@ -984,8 +1190,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args) s32 i; for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (!sub_sched_cgroup_ids[i]) { - sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; + if (!qa.sub_sched_cgroup_ids[i]) { + qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; bpf_printk("attaching sub-sched[%d] on %s", i, args->cgroup_path); return 0; @@ -1000,8 +1206,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args) s32 i; for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) { - sub_sched_cgroup_ids[i] = 0; + if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) { + qa.sub_sched_cgroup_ids[i] = 0; bpf_printk("detaching sub-sched[%d] on %s", i, args->cgroup_path); break; @@ -1009,24 +1215,26 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args) } } -SCX_OPS_DEFINE(qmap_ops, - .select_cpu = (void 
*)qmap_select_cpu, +SCX_OPS_CID_DEFINE(qmap_ops, + .flags = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK, + .select_cid = (void *)qmap_select_cid, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .tick = (void *)qmap_tick, .core_sched_before = (void *)qmap_core_sched_before, + .set_cmask = (void *)qmap_set_cmask, + .update_idle = (void *)qmap_update_idle, .init_task = (void *)qmap_init_task, + .exit_task = (void *)qmap_exit_task, .dump = (void *)qmap_dump, - .dump_cpu = (void *)qmap_dump_cpu, + .dump_cid = (void *)qmap_dump_cid, .dump_task = (void *)qmap_dump_task, .cgroup_init = (void *)qmap_cgroup_init, .cgroup_set_weight = (void *)qmap_cgroup_set_weight, .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, .sub_attach = (void *)qmap_sub_attach, .sub_detach = (void *)qmap_sub_detach, - .cpu_online = (void *)qmap_cpu_online, - .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index e7c89a2bc3d80..67ddd483a4c75 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,9 +10,11 @@ #include <inttypes.h> #include <signal.h> #include <libgen.h> +#include <sys/mman.h> #include <sys/stat.h> #include <bpf/bpf.h> #include <scx/common.h> +#include "scx_qmap.h" #include "scx_qmap.bpf.skel.h" const char help_fmt[] = @@ -21,23 +23,27 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n" +" [-N COUNT] [-P] [-M] [-H] [-c CG_PATH] [-d PID] [-D LEN] [-S] [-p] [-I]\n" +" [-F COUNT] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" 
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -P Print out DSQ content and event counters to trace_pipe every second\n" " -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" +" -c CG_PATH Cgroup path to attach as sub-scheduler, must run parent scheduler first\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" " -p Switch only tasks on SCHED_EXT policy instead of all\n" " -I Turn on SCX_OPS_ALWAYS_ENQ_IMMED\n" " -F COUNT IMMED stress: force every COUNT'th enqueue to a busy local DSQ (use with -I)\n" +" -C MODE cid-override test (shuffle|bad-dup|bad-range)\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -60,23 +66,36 @@ int main(int argc, char **argv) { struct scx_qmap *skel; struct bpf_link *link; + struct qmap_arena *qa; + __u32 test_error_cnt = 0; + __u64 ecode; int opt; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); + if (libbpf_num_possible_cpus() > SCX_QMAP_MAX_CPUS) { + fprintf(stderr, + "scx_qmap: %d possible CPUs exceeds compile-time cap %d; " + "rebuild with larger SCX_QMAP_MAX_CPUS\n", + libbpf_num_possible_cpus(), SCX_QMAP_MAX_CPUS); + return 1; + } +restart: + optind = 1; skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + skel->rodata->max_tasks = 16384; - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:C:vh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; case 'e': - skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + test_error_cnt = 
strtoul(optarg, NULL, 0); break; case 't': skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); @@ -90,6 +109,9 @@ int main(int argc, char **argv) case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'N': + skel->rodata->max_tasks = strtoul(optarg, NULL, 0); + break; case 'P': skel->rodata->print_dsqs_and_events = true; break; @@ -130,6 +152,35 @@ int main(int argc, char **argv) case 'F': skel->rodata->immed_stress_nth = strtoul(optarg, NULL, 0); break; + case 'C': { + u32 nr_cpus = libbpf_num_possible_cpus(); + u32 mode, i; + + if (!strcmp(optarg, "shuffle")) + mode = 1; + else if (!strcmp(optarg, "bad-dup")) + mode = 2; + else if (!strcmp(optarg, "bad-range")) + mode = 3; + else { + fprintf(stderr, "unknown cid-override mode '%s'\n", optarg); + return 1; + } + skel->rodata->cid_override_mode = mode; + + /* shuffle: reversed cpu_to_cid, bad-dup: dup cid 0, bad-range: identity */ + for (i = 0; i < nr_cpus; i++) { + if (mode == 1) + skel->bss->cid_override_cpu_to_cid[i] = nr_cpus - 1 - i; + else + skel->bss->cid_override_cpu_to_cid[i] = i; + } + if (mode == 2 && nr_cpus >= 2) + skel->bss->cid_override_cpu_to_cid[1] = 0; + if (mode == 3) + skel->bss->cid_override_cpu_to_cid[0] = (s32)nr_cpus; + break; + } case 'v': verbose = true; break; @@ -142,39 +193,41 @@ int main(int argc, char **argv) SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); + qa = &skel->arena->qa; + qa->test_error_cnt = test_error_cnt; + while (!exit_req && !UEI_EXITED(skel, uei)) { - long nr_enqueued = skel->bss->nr_enqueued; - long nr_dispatched = skel->bss->nr_dispatched; + long nr_enqueued = qa->nr_enqueued; + long nr_dispatched = qa->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cid0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n", nr_enqueued, nr_dispatched, nr_enqueued - 
nr_dispatched, - skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0, - skel->bss->nr_dequeued, - skel->bss->nr_core_sched_execed, - skel->bss->nr_ddsp_from_enq); - printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", - skel->bss->nr_expedited_local, - skel->bss->nr_expedited_remote, - skel->bss->nr_expedited_from_timer, - skel->bss->nr_expedited_lost); - if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + qa->nr_reenqueued, qa->nr_reenqueued_cid0, + qa->nr_dequeued, + qa->nr_core_sched_execed, + qa->nr_ddsp_from_enq); + printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n", + qa->nr_expedited_local, + qa->nr_expedited_remote, + qa->nr_expedited_from_timer, + qa->nr_expedited_lost); + if (__COMPAT_has_ksym("scx_bpf_cidperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", - skel->bss->cpuperf_min, - skel->bss->cpuperf_avg, - skel->bss->cpuperf_max, - skel->bss->cpuperf_target_min, - skel->bss->cpuperf_target_avg, - skel->bss->cpuperf_target_max); + qa->cpuperf_min, + qa->cpuperf_avg, + qa->cpuperf_max, + qa->cpuperf_target_min, + qa->cpuperf_target_avg, + qa->cpuperf_target_max); fflush(stdout); sleep(1); } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_qmap__destroy(skel); - /* - * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart - * on CPU hotplug events. - */ + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h new file mode 100644 index 0000000000000..d15a705d5ac52 --- /dev/null +++ b/tools/sched_ext/scx_qmap.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared definitions between scx_qmap.bpf.c and scx_qmap.c. + * + * The scheduler keeps all state in a single BPF arena map. 
struct + * qmap_arena is the one object that lives at the base of the arena and is + * mmap'd into userspace so the loader can read counters directly. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef __SCX_QMAP_H +#define __SCX_QMAP_H + +#ifdef __BPF__ +#include <scx/bpf_arena_common.bpf.h> +#else +#include <linux/types.h> +#include <scx/bpf_arena_common.h> +#endif + +#define MAX_SUB_SCHEDS 8 + +/* + * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and + * userspace. Keep this in sync with NR_CPUS used by the BPF side. + */ +#define SCX_QMAP_MAX_CPUS 1024 + +struct cpu_ctx { + __u64 dsp_idx; /* dispatch index */ + __u64 dsp_cnt; /* remaining count */ + __u32 avg_weight; + __u32 cpuperf_target; +}; + +/* Opaque to userspace; defined in scx_qmap.bpf.c. */ +struct task_ctx; + +struct qmap_fifo { + struct task_ctx __arena *head; + struct task_ctx __arena *tail; + __s32 idx; +}; + +struct qmap_arena { + /* userspace-visible stats */ + __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cid0; + __u64 nr_dequeued, nr_ddsp_from_enq; + __u64 nr_core_sched_execed; + __u64 nr_expedited_local, nr_expedited_remote; + __u64 nr_expedited_lost, nr_expedited_from_timer; + __u64 nr_highpri_queued; + __u32 test_error_cnt; + __u32 cpuperf_min, cpuperf_avg, cpuperf_max; + __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + + /* kernel-side runtime state */ + __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; + __u64 core_sched_head_seqs[5]; + __u64 core_sched_tail_seqs[5]; + + struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS]; + + /* task_ctx slab; allocated and threaded by qmap_init() */ + struct task_ctx __arena *task_ctxs; + struct task_ctx __arena *task_free_head; + + /* five priority FIFOs, each a doubly-linked list through task_ctx */ + struct qmap_fifo fifos[5]; +}; + +#endif /* __SCX_QMAP_H */ diff --git a/tools/sched_ext/scx_show_state.py 
b/tools/sched_ext/scx_show_state.py index 02e43c184d438..446d82807f904 100644 --- a/tools/sched_ext/scx_show_state.py +++ b/tools/sched_ext/scx_show_state.py @@ -27,18 +27,25 @@ def read_static_key(name): def state_str(state): return prog['scx_enable_state_str'][state].string_().decode() +def read_root_ops_name(): + if root: + return root.ops.name.string_().decode() + return '' + +def read_root_field(name, default): + if root: + return getattr(root, name).value_() + return default + root = prog['scx_root'] enable_state = read_atomic("scx_enable_state_var") -if root: - print(f'ops : {root.ops.name.string_().decode()}') -else: - print('ops : ') +print(f'ops : {read_root_ops_name()}') print(f'enabled : {read_static_key("__scx_enabled")}') print(f'switching_all : {read_int("scx_switching_all")}') print(f'switched_all : {read_static_key("__scx_switched_all")}') print(f'enable_state : {state_str(enable_state)} ({enable_state})') -print(f'aborting : {prog["scx_aborting"].value_()}') -print(f'bypass_depth : {prog["scx_bypass_depth"].value_()}') +print(f'aborting : {read_root_field("aborting", False)}') +print(f'bypass_depth : {read_root_field("bypass_depth", 0)}') print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') print(f'enable_seq : {read_atomic("scx_enable_seq")}') diff --git a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c index 9f16d39255e73..0d6fcc8e5eb61 100644 --- a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c +++ b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c @@ -9,12 +9,7 @@ * Copyright (C) 2026 Cheng-Yang Chou <yphbchou0911@gmail.com> */ -#include <vmlinux.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> - -/* SCX kfunc from scx_kfunc_ids_any set */ -void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +#include <scx/common.bpf.h> SEC("struct_ops/ssthresh") __u32 BPF_PROG(tcp_ca_ssthresh, struct sock *sk) diff --git 
a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index 5b6e045e1109b..7e342c0cec653 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -6,6 +6,7 @@ */ #include <bpf/bpf.h> #include <scx/common.h> +#include <stdlib.h> #include <sys/wait.h> #include <unistd.h> #include "select_cpu_dfl.bpf.skel.h" @@ -13,29 +14,44 @@ #define NUM_CHILDREN 1028 +struct select_cpu_dfl_ctx { + struct select_cpu_dfl *skel; + struct bpf_link *link; +}; + static enum scx_test_status setup(void **ctx) { - struct select_cpu_dfl *skel; + struct select_cpu_dfl_ctx *tctx; + + tctx = malloc(sizeof(*tctx)); + SCX_FAIL_IF(!tctx, "Failed to allocate test context"); + tctx->link = NULL; - skel = select_cpu_dfl__open(); - SCX_FAIL_IF(!skel, "Failed to open"); - SCX_ENUM_INIT(skel); - SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + tctx->skel = select_cpu_dfl__open(); + if (!tctx->skel) { + free(tctx); + SCX_FAIL("Failed to open"); + } + SCX_ENUM_INIT(tctx->skel); + if (select_cpu_dfl__load(tctx->skel)) { + select_cpu_dfl__destroy(tctx->skel); + free(tctx); + SCX_FAIL("Failed to load skel"); + } - *ctx = skel; + *ctx = tctx; return SCX_TEST_PASS; } static enum scx_test_status run(void *ctx) { - struct select_cpu_dfl *skel = ctx; - struct bpf_link *link; + struct select_cpu_dfl_ctx *tctx = ctx; pid_t pids[NUM_CHILDREN]; - int i, status; + int i, status, nforked = 0; - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); - SCX_FAIL_IF(!link, "Failed to attach scheduler"); + tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx) sleep(1); exit(0); } + if (pids[i] > 0) + nforked++; } for (i = 0; i < NUM_CHILDREN; i++) { + if (pids[i] <= 0) + continue; 
SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); SCX_EQ(status, 0); } - SCX_ASSERT(!skel->bss->saw_local); - - bpf_link__destroy(link); + SCX_GT(nforked, 0); + SCX_ASSERT(!tctx->skel->bss->saw_local); return SCX_TEST_PASS; } static void cleanup(void *ctx) { - struct select_cpu_dfl *skel = ctx; + struct select_cpu_dfl_ctx *tctx = ctx; - select_cpu_dfl__destroy(skel); + if (tctx->link) + bpf_link__destroy(tctx->link); + select_cpu_dfl__destroy(tctx->skel); + free(tctx); } struct scx_test select_cpu_dfl = { |
