aboutsummaryrefslogtreecommitdiffstats
path: root/tools
diff options
authorMark Brown <broonie@kernel.org>2026-05-29 23:00:10 +0100
committerMark Brown <broonie@kernel.org>2026-05-29 23:00:10 +0100
commite8b79e654b43e511a9ff0af1963f86718564c7d6 (patch)
tree716d1d18f145b89bd61f852ef7ac0b94914f6e5a /tools
parent8d353be232212e7e9a53c582d6cbc9570ad24ab8 (diff)
parent1c24a913b8ebd4e7ef86e1259f3a77b06b4911b9 (diff)
downloadlinux-next-history-e8b79e654b43e511a9ff0af1963f86718564c7d6.tar.gz
Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
Diffstat (limited to 'tools')
-rw-r--r--tools/sched_ext/README.md6
-rw-r--r--tools/sched_ext/include/scx/cid.bpf.h678
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h28
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h30
-rw-r--r--tools/sched_ext/include/scx/compat.h23
-rw-r--r--tools/sched_ext/include/scx/user_exit_info.bpf.h3
-rw-r--r--tools/sched_ext/include/scx/user_exit_info.h2
-rw-r--r--tools/sched_ext/include/scx/user_exit_info_common.h5
-rw-r--r--tools/sched_ext/scx_central.bpf.c10
-rw-r--r--tools/sched_ext/scx_cpu0.bpf.c2
-rw-r--r--tools/sched_ext/scx_cpu0.c2
-rw-r--r--tools/sched_ext/scx_flatcg.c12
-rw-r--r--tools/sched_ext/scx_qmap.bpf.c890
-rw-r--r--tools/sched_ext/scx_qmap.c107
-rw-r--r--tools/sched_ext/scx_qmap.h73
-rw-r--r--tools/sched_ext/scx_show_state.py19
-rw-r--r--tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c7
-rw-r--r--tools/testing/selftests/sched_ext/select_cpu_dfl.c54
18 files changed, 1531 insertions, 420 deletions
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
index 6e282bce453cc..0ee5a3d997e5d 100644
--- a/tools/sched_ext/README.md
+++ b/tools/sched_ext/README.md
@@ -168,9 +168,9 @@ well on single-socket systems with a unified L3 cache.
Another simple, yet slightly more complex scheduler that provides an example of
a basic weighted FIFO queuing policy. It also provides examples of some common
-useful BPF features, such as sleepable per-task storage allocation in the
-`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
-enqueue tasks. It also illustrates how core-sched support could be implemented.
+useful BPF features, such as arena-backed doubly-linked lists threaded through
+per-task context and `bpf_res_spin_lock` for per-queue synchronization. It also
+illustrates how core-sched support could be implemented.
## scx_central
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
new file mode 100644
index 0000000000000..70f2a3829af4d
--- /dev/null
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -0,0 +1,678 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the
+ * authoritative layout and semantics. The BPF-side helpers use the cmask_*
+ * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is
+ * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every
+ * helper is inline (no .c counterpart).
+ *
+ * Included by scx/common.bpf.h; don't include directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_CID_BPF_H
+#define __SCX_CID_BPF_H
+
+#include "bpf_arena_common.bpf.h"
+
+#ifndef BIT_U64
+#define BIT_U64(nr) (1ULL << (nr))
+#endif
+#ifndef GENMASK_U64
+#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
+#endif
+
+/*
+ * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with
+ * one extra word for head-misalignment. Increase if deployment targets larger
+ * NR_CPUS.
+ */
+#ifndef CMASK_MAX_WORDS
+#define CMASK_MAX_WORDS 129
+#endif
+
+/*
+ * Mirrors SCX_CMASK_NR_WORDS in kernel/sched/ext_types.h. The u64 cast keeps
+ * the +63 from wrapping when @nr_cids is near U32_MAX, so cmask_reframe()
+ * bounds-checking the result against alloc_words catches the overflow instead
+ * of seeing a small value.
+ */
+#define CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1))
+
+static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
+{
+ return cid >= m->base && cid < m->base + m->nr_cids;
+}
+
+static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
+{
+ return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
+}
+
+/**
+ * __cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. Records
+ * @base, @nr_cids and CMASK_NR_WORDS(@alloc_cids) in @m and zeroes bits[].
+ * The zeroing loop is capped at CMASK_MAX_WORDS, so storage beyond that many
+ * words is left as-is.
+ *
+ * If @nr_cids exceeds @alloc_cids, scx_bpf_error() is raised and @m is left
+ * untouched.
+ */
+static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
+ u32 nr_cids, u32 alloc_cids)
+{
+ u32 alloc_words, i;
+
+ if (unlikely(nr_cids > alloc_cids)) {
+ scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u",
+ nr_cids, alloc_cids);
+ return;
+ }
+ alloc_words = CMASK_NR_WORDS(alloc_cids);
+
+ m->base = base;
+ m->nr_cids = nr_cids;
+ m->alloc_words = alloc_words;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= alloc_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/**
+ * cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+ __cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the padding-carrying
+ * words are zeroed to keep the padding invariant: the head word, the tail
+ * word, and the last word counted by CMASK_NR_WORDS(@nr_cids). The latter can
+ * lie one past the tail word (or be bits[0] when @nr_cids is 0) and is walked
+ * by the flat scans in cmask_weight(), cmask_equal() and cmask_subset(), so
+ * it must not keep stale bits from a previous, wider frame.
+ *
+ * If CMASK_NR_WORDS(@nr_cids) exceeds @m->alloc_words, scx_bpf_error() is
+ * raised and @m is left untouched.
+ */
+static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	/* always >= 1, and <= @m->alloc_words once the check below passes */
+	u32 nr_words = CMASK_NR_WORDS(nr_cids);
+
+	if (nr_words > m->alloc_words) {
+		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
+			      nr_cids, m->alloc_words);
+		return;
+	}
+
+	/* head word; also the only word flat scans visit when @nr_cids is 0 */
+	m->bits[0] = 0;
+	/* tail word of the active range, index is at most nr_words - 1 */
+	if (nr_cids)
+		m->bits[((base & 63) + nr_cids - 1) / 64] = 0;
+	/* slack word: coincides with the tail word or sits right after it */
+	m->bits[nr_words - 1] = 0;
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
+static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return false;
+ return *__cmask_word(m, cid) & BIT_U64(cid & 63);
+}
+
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is sized so exhausting it means seconds of real spinning
+ * on one word - past any plausible contention. Abort hard.
+ */
+#define CMASK_CAS_TRIES (1U << 23)
+
+static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return true;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline void __cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
+static __always_inline void __cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) &= ~BIT_U64(cid & 63);
+}
+
+static __always_inline bool __cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w |= bit;
+ return prev;
+}
+
+static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w &= ~bit;
+ return prev;
+}
+
+static __always_inline void cmask_zero(struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/*
+ * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_*
+ * enum in ext_cid.c, which is exported via BTF and reachable through
+ * vmlinux.h.
+ */
+enum {
+ BPF_CMASK_OP_AND,
+ BPF_CMASK_OP_OR,
+ BPF_CMASK_OP_COPY,
+ BPF_CMASK_OP_ANDNOT,
+};
+
+static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src,
+ u32 di, u32 si, u64 mask, int op)
+{
+ u64 dv = dst->bits[di];
+ u64 sv = src->bits[si];
+ u64 rv;
+
+ if (op == BPF_CMASK_OP_AND)
+ rv = dv & sv;
+ else if (op == BPF_CMASK_OP_OR)
+ rv = dv | sv;
+ else if (op == BPF_CMASK_OP_ANDNOT)
+ rv = dv & ~sv;
+ else
+ rv = sv;
+
+ dst->bits[di] = (dv & ~mask) | (rv & mask);
+}
+
+static __always_inline void cmask_op(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src, int op)
+{
+ u32 d_end = dst->base + dst->nr_cids;
+ u32 s_end = src->base + src->nr_cids;
+ u32 lo = dst->base > src->base ? dst->base : src->base;
+ u32 hi = d_end < s_end ? d_end : s_end;
+ u32 d_base = dst->base / 64;
+ u32 s_base = src->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 m;
+
+ if (w > hi_word)
+ break;
+
+ m = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ m &= head_mask;
+ if (w == hi_word)
+ m &= tail_mask;
+
+ cmask_op_word(dst, src, w - d_base, w - s_base, m, op);
+ }
+}
+
+/*
+ * cmask_and/or/copy/andnot only modify @dst bits that lie in the intersection
+ * of [@dst->base, @dst->base + @dst->nr_cids) and [@src->base,
+ * @src->base + @src->nr_cids). Bits in @dst outside that window
+ * keep their prior values - in particular, cmask_copy() does NOT zero @dst
+ * bits that lie outside @src's range.
+ */
+static __always_inline void cmask_and(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_AND);
+}
+
+static __always_inline void cmask_or(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_OR);
+}
+
+static __always_inline void cmask_copy(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_COPY);
+}
+
+static __always_inline void cmask_andnot(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_ANDNOT);
+}
+
+/*
+ * True iff @a and @b have identical bits over their (assumed equal) range.
+ * Callers are expected to pass same-shape cmasks; differing shapes always
+ * compare unequal.
+ */
+static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 nr_words, i;
+
+ if (a->base != b->base || a->nr_cids != b->nr_cids)
+ return false;
+ nr_words = CMASK_NR_WORDS(a->nr_cids);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ if (a->bits[i] != b->bits[i])
+ return false;
+ }
+ return true;
+}
+
+/*
+ * True iff @a's range lies entirely within @b's range and every bit set in
+ * @a is also set in @b. If any part of @a's range falls outside @b's range,
+ * the test fails regardless of whether any bit is set there. The comparison
+ * walks CMASK_NR_WORDS(@a->nr_cids) words of @a, capped at CMASK_MAX_WORDS.
+ */
+static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 b_end = b->base + b->nr_cids;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 nr_words, i;
+
+ /* @a's range must be fully contained in @b's range, else not a subset */
+ if (a->base < b->base || a_end > b_end)
+ return false;
+
+ nr_words = CMASK_NR_WORDS(a->nr_cids);
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi_b;
+
+ if (i >= nr_words)
+ break;
+ wi_b = a_wbase + i - b_wbase;
+ if (a->bits[i] & ~b->bits[wi_b])
+ return false;
+ }
+ return true;
+}
+
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or
+ * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_cids;
+ u32 base = m->base / 64;
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = (base + wi) * 64 + ctzll(word);
+ if (found >= end)
+ return end;
+ return found;
+ }
+ return end;
+}
+
+static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
+{
+ return cmask_next_set(m, m->base);
+}
+
+#define cmask_for_each(cid, m) \
+ for ((cid) = cmask_first_set(m); \
+ (cid) < (m)->base + (m)->nr_cids; \
+ (cid) = cmask_next_set((m), (cid) + 1))
+
+/*
+ * Population count over [base, base + nr_cids). Padding bits in the head/tail
+ * words are guaranteed zero by the mutating helpers, so a flat popcount over
+ * all words is correct.
+ */
+static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
+ u32 count = 0;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ count += __builtin_popcountll(m->bits[i]);
+ }
+ return count;
+}
+
+/*
+ * True if @a and @b share any set bit. Walk only the intersection of their
+ * ranges, matching the semantics of cmask_and().
+ */
+static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 b_end = b->base + b->nr_cids;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 a_base = a->base / 64;
+ u32 b_base = b->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return false;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 mask, av, bv;
+
+ if (w > hi_word)
+ break;
+
+ mask = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ mask &= head_mask;
+ if (w == hi_word)
+ mask &= tail_mask;
+
+ av = a->bits[w - a_base] & mask;
+ bv = b->bits[w - b_base] & mask;
+ if (av & bv)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, bounded by the
+ * intersection of the two ranges. Return a->base + a->nr_cids if none found.
+ *
+ * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
+ * scan without wrap call this directly.
+ */
+static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 b_end = b->base + b->nr_cids;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 last_wi, start_wi, start_bit, i;
+
+ if (lo >= hi)
+ return a_end;
+ if (start < lo)
+ start = lo;
+ if (start >= hi)
+ return a_end;
+
+ last_wi = (hi - 1) / 64;
+ start_wi = start / 64;
+ start_bit = start & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 abs_wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (abs_wi > last_wi)
+ break;
+
+ word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = abs_wi * 64 + ctzll(word);
+ if (found >= hi)
+ return a_end;
+ return found;
+ }
+ return a_end;
+}
+
+/*
+ * Find the next set cid in @m at or after @start, wrapping to @m->base if no
+ * set bit is found in [start, m->base + m->nr_cids). Return m->base +
+ * m->nr_cids if @m is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
+ u32 start)
+{
+ u32 end = m->base + m->nr_cids;
+ u32 found;
+
+ found = cmask_next_set(m, start);
+ if (found < end || start <= m->base)
+ return found;
+
+ found = cmask_next_set(m, m->base);
+ return found < start ? found : end;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, wrapping to
+ * @a->base if none found in the forward half. Return a->base + a->nr_cids
+ * if the intersection is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 found;
+
+ found = cmask_next_and_set(a, b, start);
+ if (found < a_end || start <= a->base)
+ return found;
+
+ found = cmask_next_and_set(a, b, a->base);
+ return found < start ? found : a_end;
+}
+
+/**
+ * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
+ * @m: cmask to fill. Zeroed first via cmask_zero(); only bits within
+ * [@m->base, @m->base + @m->nr_cids) are updated - cpus mapping to cids
+ * outside that range are ignored.
+ * @cpumask: kernel cpumask to translate
+ *
+ * For each cpu in [0, scx_bpf_nr_cpu_ids()) that is set in @cpumask, set the
+ * cpu's cid in @m. cpus for which scx_bpf_cpu_to_cid() returns a negative
+ * value are skipped. Bits are set with the non-atomic __cmask_set(). Caller
+ * must ensure @cpumask stays stable across the call (e.g. RCU read lock for
+ * task->cpus_ptr).
+ */
+static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
+ const struct cpumask *cpumask)
+{
+ u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ s32 cpu;
+
+ cmask_zero(m);
+ bpf_for(cpu, 0, nr_cpu_ids) {
+ s32 cid;
+
+ if (!bpf_cpumask_test_cpu(cpu, cpumask))
+ continue;
+ cid = scx_bpf_cpu_to_cid(cpu);
+ if (cid >= 0)
+ __cmask_set(m, cid);
+ }
+}
+
+#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41a..5f715d69cde6d 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -99,8 +99,21 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
+struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+s32 scx_bpf_cpu_to_cid(s32 cpu) __ksym __weak;
+s32 scx_bpf_cid_to_cpu(s32 cid) __ksym __weak;
+void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out) __ksym __weak;
+void scx_bpf_kick_cid(s32 cid, u64 flags) __ksym __weak;
+s32 scx_bpf_task_cid(const struct task_struct *p) __ksym __weak;
+s32 scx_bpf_this_cid(void) __ksym __weak;
+struct task_struct *scx_bpf_cid_curr(s32 cid) __ksym __weak;
+u32 scx_bpf_nr_cids(void) __ksym __weak;
+u32 scx_bpf_nr_online_cids(void) __ksym __weak;
+u32 scx_bpf_cidperf_cap(s32 cid) __ksym __weak;
+u32 scx_bpf_cidperf_cur(s32 cid) __ksym __weak;
+void scx_bpf_cidperf_set(s32 cid, u32 perf) __ksym __weak;
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -526,6 +539,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
/*
* Time helpers, most of which are from jiffies.h.
*/
@@ -1035,7 +1052,18 @@ static inline u64 scx_clock_irq(u32 cpu)
return irqt ? BPF_CORE_READ(irqt, total) : 0;
}
+/*
+ * Abbreviated forms of <linux/overflow.h>'s struct_size() family. Unlike the
+ * kernel versions, these do plain arithmetic with no overflow checking.
+ */
+#define flex_array_size(p, member, count) \
+ ((count) * sizeof(*(p)->member))
+
+#define struct_size(p, member, count) \
+ (offsetof(typeof(*(p)), member) + flex_array_size(p, member, count))
+
+#define struct_size_t(type, member, count) \
+ struct_size((type *)NULL, member, count)
+
#include "compat.bpf.h"
#include "enums.bpf.h"
+#include "cid.bpf.h"
#endif /* __SCX_COMMON_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 8977b5a2caa10..87f15f2962348 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -121,6 +121,18 @@ static inline bool scx_bpf_sub_dispatch(u64 cgroup_id)
return false;
}
+/*
+ * v7.2: scx_bpf_cid_override() for explicit cpu->cid mapping. Ignore if
+ * missing.
+ */
+void scx_bpf_cid_override___compat(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) __ksym __weak;
+
+/*
+ * Forward to scx_bpf_cid_override___compat() when bpf_ksym_exists() reports
+ * the kfunc as available; do nothing otherwise.
+ */
+static inline void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz)
+{
+	/* plain call: a void function must not `return` an expression */
+	if (bpf_ksym_exists(scx_bpf_cid_override___compat))
+		scx_bpf_cid_override___compat(cpu_to_cid, cpu_to_cid__sz);
+}
+
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
* in a compatible way. We will preserve this __COMPAT helper until v6.16.
@@ -423,8 +435,10 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
}
/*
- * Define sched_ext_ops. This may be expanded to define multiple variants for
- * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
+ * Define sched_ext_ops. See compat.h::SCX_OPS_OPEN() for how backward
+ * compatibility is handled (this macro can be expanded to emit multiple
+ * variants for incompatible op changes; SCX_OPS_OPEN() handles purely
+ * additive changes at load time).
*/
#define SCX_OPS_DEFINE(__name, ...) \
SEC(".struct_ops.link") \
@@ -432,4 +446,16 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
__VA_ARGS__, \
};
+/*
+ * Define a cid-form sched_ext_ops. Programs targeting this struct_ops type
+ * use cid-form callback signatures (select_cid, set_cmask, cid_online/offline,
+ * dispatch with cid arg, etc.) and may only call the cid-form scx_bpf_*
+ * kfuncs (kick_cid, task_cid, this_cid, ...).
+ */
+#define SCX_OPS_CID_DEFINE(__name, ...) \
+ SEC(".struct_ops.link") \
+ struct sched_ext_ops_cid __name = { \
+ __VA_ARGS__, \
+ };
+
#endif /* __SCX_COMPAT_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 039854c490d5e..602f07061ee39 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -149,10 +149,24 @@ static inline long scx_hotplug_seq(void)
}
/*
- * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
- * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
- * and attach it, backward compatibility is automatically maintained where
- * reasonable.
+ * Open the sched_ext_ops skeleton.
+ *
+ * struct sched_ext_ops can change over time. Two complementary mechanisms
+ * keep BPF schedulers built against newer headers running on older kernels:
+ *
+ * 1. Load-time fix-up (this macro). For each optional ops callback or field
+ * added to struct sched_ext_ops, an explicit stanza below probes the
+ * running kernel's BTF via __COMPAT_struct_has_field() and, if the field
+ * is missing, clears it in the in-memory struct_ops (with a warning to
+ * stderr) before load. Handles additive changes - a new stanza must be
+ * added here for each new optional field.
+ *
+ * 2. Multi-variant struct_ops via compat.bpf.h::SCX_OPS_DEFINE(). That
+ * macro can be expanded to emit several variants of struct sched_ext_ops,
+ * and SCX_OPS_LOAD()/ATTACH() can pick the right one based on what the
+ * kernel supports. Needed when an existing operation has to change
+ * incompatibly (e.g. a callback signature changes); the load-time
+ * fix-up above only handles purely additive changes.
*
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version.
@@ -225,6 +239,7 @@ static inline void __scx_ops_assoc_prog(struct bpf_program *prog,
}
#endif
+/* See SCX_OPS_OPEN() above for backward-compatibility handling. */
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
struct bpf_program *__prog; \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h
index e7ac6611a9901..98cab643c8d9a 100644
--- a/tools/sched_ext/include/scx/user_exit_info.bpf.h
+++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h
@@ -32,6 +32,9 @@
__uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
+ __uei_name.exit_cpu = -1; \
+ if (bpf_core_field_exists((__ei)->exit_cpu)) \
+ __uei_name.exit_cpu = (__ei)->exit_cpu; \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
(__ei)->kind); \
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 399697fa372fb..56a02b549aef9 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -39,6 +39,8 @@
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
+ if (__uei->exit_cpu >= 0) \
+ fprintf(stderr, " on CPU %d", __uei->exit_cpu); \
fputs("\n", stderr); \
__uei->exit_code; \
})
diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h
index 2d0981aedd898..76e2a055eb4b0 100644
--- a/tools/sched_ext/include/scx/user_exit_info_common.h
+++ b/tools/sched_ext/include/scx/user_exit_info_common.h
@@ -22,6 +22,11 @@ enum uei_sizes {
struct user_exit_info {
int kind;
+ /*
+ * CPU that triggered the exit, or -1 if unset (e.g. running on an
+ * older kernel that does not expose this field).
+ */
+ s32 exit_cpu;
s64 exit_code;
char reason[UEI_REASON_LEN];
char msg[UEI_MSG_LEN];
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 4efcce099bd52..64dd60b3e9223 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -149,10 +149,14 @@ static bool dispatch_to_cpu(s32 cpu)
}
/*
- * If we can't run the task at the top, do the dumb thing and
- * bounce it to the fallback dsq.
+ * If we can't run the task at the top for whatever reason,
+ * bounce it to the fallback dsq. Also check
+ * is_migration_disabled() explicitly as p->cpus_ptr may not
+ * reflect the migration-disabled state yet if
+ * migrate_disable_switch() hasn't run.
*/
- if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ (is_migration_disabled(p) && scx_bpf_task_cpu(p) != cpu)) {
__sync_fetch_and_add(&nr_mismatches, 1);
scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
index 0b1a7ce879b06..909d1be1bfe3c 100644
--- a/tools/sched_ext/scx_cpu0.bpf.c
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -18,8 +18,6 @@
char _license[] SEC("license") = "GPL";
-const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
-
UEI_DEFINE(uei);
/*
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
index a6fba9978b9ce..4966e3d4c7249 100644
--- a/tools/sched_ext/scx_cpu0.c
+++ b/tools/sched_ext/scx_cpu0.c
@@ -72,8 +72,6 @@ restart:
optind = 1;
skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
- skel->rodata->nr_cpus = libbpf_num_possible_cpus();
-
while ((opt = getopt(argc, argv, "vh")) != -1) {
switch (opt) {
case 'v':
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index d865c381589bb..de2bef86d64d6 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -130,7 +130,6 @@ int main(int argc, char **argv)
struct scx_flatcg *skel;
struct bpf_link *link;
struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
- bool dump_cgrps = false;
__u64 last_cpu_sum = 0, last_cpu_idle = 0;
__u64 last_stats[FCG_NR_STATS] = {};
unsigned long seq = 0;
@@ -148,7 +147,7 @@ restart:
assert(skel->rodata->nr_cpus > 0);
skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
- while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:i:fvh")) != -1) {
double v;
switch (opt) {
@@ -161,9 +160,6 @@ restart:
intv_ts.tv_sec = v;
intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
break;
- case 'd':
- dump_cgrps = true;
- break;
case 'f':
skel->rodata->fifo_sched = true;
break;
@@ -177,10 +173,10 @@ restart:
}
}
- printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
+ printf("slice=%.1lfms intv=%.1lfs",
(double)skel->rodata->cgrp_slice_ns / 1000000.0,
- (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
- dump_cgrps);
+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0);
+
SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index aad698fe294bf..8a2d6a8ebd8ed 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -2,15 +2,16 @@
/*
* A simple five-level FIFO queue scheduler.
*
- * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
- * assigned to one depending on its compound weight. Each CPU round robins
- * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
- * queue0, 2 from queue1, 4 from queue2 and so on.
+ * There are five FIFOs implemented as arena-backed doubly-linked lists
+ * threaded through per-task context. A task gets assigned to one depending on
+ * its compound weight. Each CPU round robins through the FIFOs and dispatches
+ * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from
+ * queue2 and so on.
*
* This scheduler demonstrates:
*
- * - BPF-side queueing using PIDs.
- * - Sleepable per-task storage allocation using ops.prep_enable().
+ * - BPF-side queueing using TIDs.
+ * - BPF arena for scheduler state.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
@@ -22,6 +23,8 @@
*/
#include <scx/common.bpf.h>
+#include "scx_qmap.h"
+
enum consts {
ONE_SEC_IN_NS = 1000000000,
ONE_MSEC_IN_NS = 1000000,
@@ -47,40 +50,72 @@ const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS 8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+/*
+ * Optional cid-override test harness. When cid_override_mode is non-zero,
+ * qmap_init() calls scx_bpf_cid_override() with the caller-supplied
+ * cpu_to_cid array to exercise the kfunc's acceptance and error paths.
+ *
+ * 0 = disabled
+ * 1 = valid reverse mapping
+ * 2 = invalid: duplicate cid assignment
+ * 3 = invalid: out-of-range cid
+ */
+const volatile u32 cid_override_mode;
+/*
+ * Array lives in bss (writable) because scx_bpf_cid_override()'s BPF
+ * verifier signature treats its len-paired pointer as read/write - rodata
+ * fails verification with "write into map forbidden". Userspace populates
+ * it before SCX_OPS_LOAD, same as rodata, and nothing writes it after.
+ */
+s32 cid_override_cpu_to_cid[SCX_QMAP_MAX_CPUS];
UEI_DEFINE(uei);
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
-
+/*
+ * All scheduler state - per-cpu context, stats counters, core-sched sequence
+ * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace
+ * reaches it via skel->arena->qa.
+ */
struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 1 << 16); /* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena_global qa;
+
+/*
+ * Global idle-cid tracking, maintained via update_idle / cpu_offline and
+ * scanned by the direct-dispatch path. Allocated in qmap_init() from one
+ * arena page, sized to the full cid space.
+ */
+struct scx_cmask __arena *qa_idle_cids;
+
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -102,85 +137,214 @@ static const u32 qidx_to_cpuperf_target[] = {
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @fifo points at
+ * that queue and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @fifo is NULL when the task isn't on any queue.
+ */
struct task_ctx {
- bool force_local; /* Dispatch directly to local_dsq */
- bool highpri;
- u64 core_sched_seq;
+ struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ u64 tid;
+ s32 pid; /* for dump only */
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+ struct scx_cmask cpus_allowed; /* per-task affinity in cid space */
+};
+
+/*
+ * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the
+ * tail bytes appended per entry; struct_size_t() gives the actual per-entry
+ * footprint.
+ */
+#define TASK_CTX_STRIDE \
+ struct_size_t(struct task_ctx, cpus_allowed.bits, \
+ CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS))
+
+/* All task_ctx pointers are arena pointers. */
+typedef struct task_ctx __arena task_ctx_t;
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+ task_ctx_t *taskc;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
- __type(value, struct task_ctx);
+ __type(value, struct task_ctx_stor_val);
} task_ctx_stor SEC(".maps");
-struct cpu_ctx {
- u64 dsp_idx; /* dispatch index */
- u64 dsp_cnt; /* remaining count */
- u32 avg_weight;
- u32 cpuperf_target;
-};
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
-struct {
- __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
- __uint(max_entries, 1);
- __type(key, u32);
- __type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ if (bpf_res_spin_lock(lock)) {
+ scx_bpf_error("res_spin_lock failed");
+ return -EBUSY;
+ }
+ return 0;
+}
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+/*
+ * Try prev_cid, then scan taskc->cpus_allowed AND qa_idle_cids round-robin
+ * from prev_cid + 1. Atomic claim retries on race; bounded by
+ * IDLE_PICK_RETRIES to keep the verifier's insn budget in check.
+ */
+#define IDLE_PICK_RETRIES 16
-static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
+static s32 pick_direct_dispatch_cid(struct task_struct *p, s32 prev_cid,
+ task_ctx_t *taskc)
{
- s32 cpu;
+ u32 nr_cids = scx_bpf_nr_cids();
+ s32 cid;
+ u32 i;
if (!always_enq_immed && p->nr_cpus_allowed == 1)
- return prev_cpu;
-
- if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
- return prev_cpu;
+ return prev_cid;
- cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- return cpu;
+ if (cmask_test_and_clear(qa_idle_cids, prev_cid))
+ return prev_cid;
+ cid = prev_cid;
+ bpf_for(i, 0, IDLE_PICK_RETRIES) {
+ cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
+ qa_idle_cids, cid + 1);
+ barrier_var(cid);
+ if (cid >= nr_cids)
+ return -1;
+ if (cmask_test_and_clear(qa_idle_cids, cid))
+ return cid;
+ }
return -1;
}
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cid) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
+
+static task_ctx_t *lookup_task_ctx(struct task_struct *p)
+{
+ struct task_ctx_stor_val *v;
+
+ QMAP_TOUCH_ARENA();
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!v || !v->taskc)
+ return NULL;
+ return v->taskc;
+}
+
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ task_ctx_t *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(task_ctx_t *taskc)
{
- return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared fifo. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
}
-s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
- s32 prev_cpu, u64 wake_flags)
+s32 BPF_STRUCT_OPS(qmap_select_cid, struct task_struct *p,
+ s32 prev_cid, u64 wake_flags)
{
- struct task_ctx *tctx;
- s32 cpu;
+ task_ctx_t *taskc;
+ s32 cid;
- if (!(tctx = lookup_task_ctx(p)))
- return prev_cpu;
+ if (!(taskc = lookup_task_ctx(p)))
+ return prev_cid;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
- return prev_cpu;
+ return prev_cid;
- cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+ cid = pick_direct_dispatch_cid(p, prev_cid, taskc);
- if (cpu >= 0) {
- tctx->force_local = true;
- return cpu;
+ if (cid >= 0) {
+ taskc->force_local = true;
+ return cid;
} else {
- return prev_cpu;
+ return prev_cid;
}
}
@@ -202,16 +366,14 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
- u32 pid = p->pid;
+ task_ctx_t *taskc;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
- s32 cpu;
+ s32 cid;
if (enq_flags & SCX_ENQ_REENQ) {
- __sync_fetch_and_add(&nr_reenqueued, 1);
- if (scx_bpf_task_cpu(p) == 0)
- __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued, 1);
+ if (scx_bpf_task_cid(p) == 0)
+ __sync_fetch_and_add(&qa.nr_reenqueued_cid0, 1);
}
if (p->flags & PF_KTHREAD) {
@@ -222,17 +384,17 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- if (test_error_cnt && !--test_error_cnt)
+ if (qa.test_error_cnt && !--qa.test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -243,19 +405,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
- scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
+ taskc->force_local = false;
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cid(p),
slice_ns, enq_flags);
return;
}
}
/*
- * If qmap_select_cpu() is telling us to or this is the last runnable
+ * If qmap_select_cid() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -267,11 +429,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- /* if select_cpu() wasn't called, try direct dispatch */
+ /* if select_cid() wasn't called, try direct dispatch */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
- (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
- __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
- scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
+ (cid = pick_direct_dispatch_cid(p, scx_bpf_task_cid(p), taskc)) >= 0) {
+ __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cid, slice_ns, enq_flags);
return;
}
@@ -279,55 +441,52 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If the task was re-enqueued due to the CPU being preempted by a
* higher priority scheduling class, just re-enqueue the task directly
* on the global DSQ. As we want another CPU to pick it up, find and
- * kick an idle CPU.
+ * kick an idle cid.
*/
if (enq_flags & SCX_ENQ_REENQ) {
- s32 cpu;
+ s32 cid;
scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
- cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
+ qa_idle_cids, 0);
+ if (cid < scx_bpf_nr_cids())
+ scx_bpf_kick_cid(cid, SCX_KICK_IDLE);
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
- __sync_fetch_and_add(&nr_highpri_queued, 1);
+ taskc->highpri = true;
+ __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
}
- __sync_fetch_and_add(&nr_enqueued, 1);
+ __sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
- __sync_fetch_and_add(&nr_dequeued, 1);
+ task_ctx_t *taskc;
+
+ __sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
- __sync_fetch_and_add(&nr_core_sched_execed, 1);
+ __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -343,17 +502,18 @@ static void update_core_sched_head_seq(struct task_struct *p)
static bool dispatch_highpri(bool from_timer)
{
struct task_struct *p;
- s32 this_cpu = bpf_get_smp_processor_id();
+ s32 this_cid = scx_bpf_this_cid();
+ u32 nr_cids = scx_bpf_nr_cids();
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -362,30 +522,38 @@ static bool dispatch_highpri(bool from_timer)
}
/*
- * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
- * is found.
+ * Scan HIGHPRI_DSQ and dispatch until a task that can run here is
+ * found. Prefer this_cid if the task allows it; otherwise RR-scan the
+ * task's cpus_allowed starting after this_cid.
*/
bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
+ task_ctx_t *taskc;
bool dispatched = false;
- s32 cpu;
+ s32 cid;
+
+ if (!(taskc = lookup_task_ctx(p)))
+ return false;
- if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
- cpu = this_cpu;
+ if (cmask_test(&taskc->cpus_allowed, this_cid))
+ cid = this_cid;
else
- cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
+ cid = cmask_next_set_wrap(&taskc->cpus_allowed,
+ this_cid + 1);
+ if (cid >= nr_cids)
+ continue;
- if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+ if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cid,
SCX_ENQ_PREEMPT)) {
- if (cpu == this_cpu) {
+ if (cid == this_cid) {
dispatched = true;
- __sync_fetch_and_add(&nr_expedited_local, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_local, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_remote, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
}
if (from_timer)
- __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_lost, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
}
if (dispatched)
@@ -395,22 +563,21 @@ static bool dispatch_highpri(bool from_timer)
return false;
}
-void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cid, struct task_struct *prev)
{
struct task_struct *p;
- struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
- u32 zero = 0, batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ struct cpu_ctx __arena *cpuc;
+ task_ctx_t *taskc;
+ u32 batch = dsp_batch ?: 1;
+ s32 i;
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+ if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
- if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@@ -424,10 +591,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
+ cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
@@ -436,33 +600,23 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = scx_bpf_tid_to_task(taskc->tid);
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
- if (tctx->highpri)
- __sync_fetch_and_sub(&nr_highpri_queued, 1);
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
update_core_sched_head_seq(p);
- __sync_fetch_and_add(&nr_dispatched, 1);
+ __sync_fetch_and_add(&qa.nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
@@ -502,10 +656,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* document this class of issue -- other schedulers
* seeing similar warnings can use this as a reference.
*/
- if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
- scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
-
- bpf_task_release(p);
+ if (!cmask_test(&taskc->cpus_allowed, cid))
+ scx_bpf_kick_cid(scx_bpf_task_cid(p), 0);
batch--;
cpuc->dsp_cnt--;
@@ -523,8 +675,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] &&
- scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ if (qa.sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
return;
}
@@ -533,24 +685,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (tctx)
- tctx->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ taskc = lookup_task_ctx(prev);
+ if (!taskc)
+ return;
+
+ taskc->core_sched_seq =
+ qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
- struct cpu_ctx *cpuc;
- u32 zero = 0;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
int idx;
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
-
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
@@ -560,7 +708,7 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
idx = weight_to_idx(cpuc->avg_weight);
cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
- scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+ scx_bpf_cidperf_set(scx_bpf_task_cid(p), cpuc->cpuperf_target);
}
/*
@@ -570,14 +718,14 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx)
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
return 0;
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
@@ -610,70 +758,110 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
* tasks when a higher-priority scheduling class takes the CPU.
*/
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
{
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
if (p->tgid == disallow_tgid)
p->scx.disallow = true;
- /*
- * @p is new. Let's ensure that its task_ctx is available. We can sleep
- * in this function and the following will automatically use GFP_KERNEL.
- */
- if (bpf_task_storage_get(&task_ctx_stor, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE))
- return 0;
- else
+ /* pop a slab entry off the free list */
+ if (qmap_spin_lock(&qa_task_lock))
+ return -EBUSY;
+ taskc = qa.task_free_head;
+ if (taskc)
+ qa.task_free_head = taskc->next_free;
+ bpf_res_spin_unlock(&qa_task_lock);
+ if (!taskc) {
+ scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+ return -ENOMEM;
+ }
+
+ taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->tid = p->scx.tid;
+ taskc->pid = p->pid;
+ taskc->force_local = false;
+ taskc->highpri = false;
+ taskc->core_sched_seq = 0;
+ cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids());
+ bpf_rcu_read_lock();
+ cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr);
+ bpf_rcu_read_unlock();
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v) {
+ /* push back to the free list */
+ if (!qmap_spin_lock(&qa_task_lock)) {
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+ }
return -ENOMEM;
+ }
+ v->taskc = taskc;
+ return 0;
}
-void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
{
- s32 i, pid;
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
- if (suppress_dump)
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
+ v->taskc = NULL;
- bpf_for(i, 0, 5) {
- void *fifo;
+ if (qmap_spin_lock(&qa_task_lock))
+ return;
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+}
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+{
+ task_ctx_t *taskc;
+ s32 i;
- scx_bpf_dump("QMAP FIFO[%d]:", i);
+ QMAP_TOUCH_ARENA();
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
- break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
- }
+ if (suppress_dump)
+ return;
+ /*
+ * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+ * a lock and dump. Best-effort; racing may print stale tids but the
+ * walk is bounded by bpf_repeat() so it always terminates.
+ */
+ bpf_for(i, 0, 5) {
+ scx_bpf_dump("QMAP FIFO[%d]:", i);
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(fifo, &pid, 0);
+ scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid);
+ taskc = taskc->q_next;
}
-
scx_bpf_dump("\n");
}
}
-void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
+void BPF_STRUCT_OPS(qmap_dump_cid, struct scx_dump_ctx *dctx, s32 cid, bool idle)
{
- u32 zero = 0;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
if (suppress_dump || idle)
return;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
- return;
scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -682,12 +870,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
@@ -716,61 +909,25 @@ void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
cgrp->kn->id, period_us, quota_us, burst_us);
}
-/*
- * Print out the online and possible CPU map using bpf_printk() as a
- * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
- */
-static void print_cpus(void)
+void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
{
- const struct cpumask *possible, *online;
- s32 cpu;
- char buf[128] = "", *p;
- int idx;
-
- possible = scx_bpf_get_possible_cpumask();
- online = scx_bpf_get_online_cpumask();
-
- idx = 0;
- bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
- if (!(p = MEMBER_VPTR(buf, [idx++])))
- break;
- if (bpf_cpumask_test_cpu(cpu, online))
- *p++ = 'O';
- else if (bpf_cpumask_test_cpu(cpu, possible))
- *p++ = 'X';
- else
- *p++ = ' ';
-
- if ((cpu & 7) == 7) {
- if (!(p = MEMBER_VPTR(buf, [idx++])))
- break;
- *p++ = '|';
- }
- }
- buf[sizeof(buf) - 1] = '\0';
-
- scx_bpf_put_cpumask(online);
- scx_bpf_put_cpumask(possible);
-
- bpf_printk("CPUS: |%s", buf);
+ QMAP_TOUCH_ARENA();
+ if (idle)
+ cmask_set(qa_idle_cids, cid);
+ else
+ cmask_clear(qa_idle_cids, cid);
}
-void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
+void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
+ const struct scx_cmask *cmask_in)
{
- if (print_msgs) {
- bpf_printk("CPU %d coming online", cpu);
- /* @cpu is already online at this point */
- print_cpus();
- }
-}
+ struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
+ task_ctx_t *taskc;
-void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
-{
- if (print_msgs) {
- bpf_printk("CPU %d going offline", cpu);
- /* @cpu is still online at this point */
- print_cpus();
- }
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
+ return;
+ cmask_copy(&taskc->cpus_allowed, cmask);
}
struct monitor_timer {
@@ -785,64 +942,49 @@ struct {
} monitor_timer SEC(".maps");
/*
- * Print out the min, avg and max performance levels of CPUs every second to
- * demonstrate the cpuperf interface.
+ * Aggregate cidperf across the first nr_online_cids cids. Post-hotplug
+ * the first-N-are-online invariant drifts, so some cap/cur values may
+ * be stale. For this demo monitor that's fine; the scheduler exits on
+ * the enable-time hotplug_seq mismatch and userspace restarts, which
+ * rebuilds the layout.
*/
static void monitor_cpuperf(void)
{
- u32 zero = 0, nr_cpu_ids;
+ u32 nr_online = scx_bpf_nr_online_cids();
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
- const struct cpumask *online;
- int i, nr_online_cpus = 0;
-
- nr_cpu_ids = scx_bpf_nr_cpu_ids();
- online = scx_bpf_get_online_cpumask();
-
- bpf_for(i, 0, nr_cpu_ids) {
- struct cpu_ctx *cpuc;
- u32 cap, cur;
+ s32 cid;
- if (!bpf_cpumask_test_cpu(i, online))
- continue;
- nr_online_cpus++;
+ QMAP_TOUCH_ARENA();
- /* collect the capacity and current cpuperf */
- cap = scx_bpf_cpuperf_cap(i);
- cur = scx_bpf_cpuperf_cur(i);
+ bpf_for(cid, 0, nr_online) {
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
+ u32 cap = scx_bpf_cidperf_cap(cid);
+ u32 cur = scx_bpf_cidperf_cur(cid);
+ u32 target;
cur_min = cur < cur_min ? cur : cur_min;
cur_max = cur > cur_max ? cur : cur_max;
- /*
- * $cur is relative to $cap. Scale it down accordingly so that
- * it's in the same scale as other CPUs and $cur_sum/$cap_sum
- * makes sense.
- */
- cur_sum += cur * cap / SCX_CPUPERF_ONE;
+ cur_sum += (u64)cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
- scx_bpf_error("failed to look up cpu_ctx");
- goto out;
- }
-
- /* collect target */
- cur = cpuc->cpuperf_target;
- target_sum += cur;
- target_min = cur < target_min ? cur : target_min;
- target_max = cur > target_max ? cur : target_max;
+ target = cpuc->cpuperf_target;
+ target_sum += target;
+ target_min = target < target_min ? target : target_min;
+ target_max = target > target_max ? target : target_max;
}
- cpuperf_min = cur_min;
- cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
- cpuperf_max = cur_max;
+ if (!nr_online || !cap_sum)
+ return;
+
+ qa.cpuperf_min = cur_min;
+ qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ qa.cpuperf_max = cur_max;
- cpuperf_target_min = target_min;
- cpuperf_target_avg = target_sum / nr_online_cpus;
- cpuperf_target_max = target_max;
-out:
- scx_bpf_put_cpumask(online);
+ qa.cpuperf_target_min = target_min;
+ qa.cpuperf_target_avg = target_sum / nr_online;
+ qa.cpuperf_target_max = target_max;
}
/*
@@ -927,12 +1069,76 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
- u32 key = 0;
+ u8 __arena *slab;
+ u32 nr_pages, key = 0, i;
+ u32 nr_cids, nr_cpu_ids;
struct bpf_timer *timer;
s32 ret;
- if (print_msgs && !sub_cgroup_id)
- print_cpus();
+ nr_cids = scx_bpf_nr_cids();
+ nr_cpu_ids = scx_bpf_nr_cpu_ids();
+
+ if (nr_cids > SCX_QMAP_MAX_CPUS) {
+ scx_bpf_error("nr_cids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
+ nr_cids, SCX_QMAP_MAX_CPUS);
+ return -EINVAL;
+ }
+ if (nr_cpu_ids > SCX_QMAP_MAX_CPUS) {
+ scx_bpf_error("nr_cpu_ids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
+ nr_cpu_ids, SCX_QMAP_MAX_CPUS);
+ return -EINVAL;
+ }
+
+ /*
+ * cid-override test hook. Must run before anything that reads the
+ * cid space (scx_bpf_nr_cids, cmask_init, etc.). On invalid input,
+ * the kfunc calls scx_error() which aborts the scheduler.
+ */
+ if (cid_override_mode) {
+ scx_bpf_cid_override((const s32 *)cid_override_cpu_to_cid,
+ nr_cpu_ids * sizeof(s32));
+ }
+
+ /*
+ * Allocate the task_ctx slab in arena and thread the entire slab onto
+ * the free list. max_tasks is set by userspace before load. Each entry
+ * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex
+ * array extends into the stride tail.
+ */
+ if (!max_tasks) {
+ scx_bpf_error("max_tasks must be > 0");
+ return -EINVAL;
+ }
+
+ nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE;
+ slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+ if (!slab) {
+ scx_bpf_error("failed to allocate task_ctx slab");
+ return -ENOMEM;
+ }
+ qa.task_ctxs = (task_ctx_t *)slab;
+
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
+ bpf_for(i, 0, max_tasks) {
+ task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE);
+ task_ctx_t *next = (i + 1 < max_tasks) ?
+ (task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL;
+ cur->next_free = next;
+ }
+ qa.task_free_head = (task_ctx_t *)slab;
+
+ /*
+ * Allocate and initialize the idle cmask. Starts empty - update_idle
+ * fills it as cpus enter idle.
+ */
+ qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+ if (!qa_idle_cids) {
+ scx_bpf_error("failed to allocate idle cmask");
+ return -ENOMEM;
+ }
+ cmask_init(qa_idle_cids, 0, nr_cids);
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret) {
@@ -984,8 +1190,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (!sub_sched_cgroup_ids[i]) {
- sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ if (!qa.sub_sched_cgroup_ids[i]) {
+ qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
bpf_printk("attaching sub-sched[%d] on %s",
i, args->cgroup_path);
return 0;
@@ -1000,8 +1206,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
- sub_sched_cgroup_ids[i] = 0;
+ if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ qa.sub_sched_cgroup_ids[i] = 0;
bpf_printk("detaching sub-sched[%d] on %s",
i, args->cgroup_path);
break;
@@ -1009,24 +1215,26 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
}
}
-SCX_OPS_DEFINE(qmap_ops,
- .select_cpu = (void *)qmap_select_cpu,
+SCX_OPS_CID_DEFINE(qmap_ops,
+ .flags = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK,
+ .select_cid = (void *)qmap_select_cid,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
+ .set_cmask = (void *)qmap_set_cmask,
+ .update_idle = (void *)qmap_update_idle,
.init_task = (void *)qmap_init_task,
+ .exit_task = (void *)qmap_exit_task,
.dump = (void *)qmap_dump,
- .dump_cpu = (void *)qmap_dump_cpu,
+ .dump_cid = (void *)qmap_dump_cid,
.dump_task = (void *)qmap_dump_task,
.cgroup_init = (void *)qmap_cgroup_init,
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
.sub_attach = (void *)qmap_sub_attach,
.sub_detach = (void *)qmap_sub_detach,
- .cpu_online = (void *)qmap_cpu_online,
- .cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.timeout_ms = 5000U,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d80..67ddd483a4c75 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
+#include <sys/mman.h>
#include <sys/stat.h>
#include <bpf/bpf.h>
#include <scx/common.h>
+#include "scx_qmap.h"
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
@@ -21,23 +23,27 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+" [-N COUNT] [-P] [-M] [-H] [-c CG_PATH] [-d PID] [-D LEN] [-S] [-p] [-I]\n"
+" [-F COUNT] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content and event counters to trace_pipe every second\n"
" -M Print out debug messages to trace_pipe\n"
" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
+" -c CG_PATH Cgroup path to attach as sub-scheduler, must run parent scheduler first\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy instead of all\n"
" -I Turn on SCX_OPS_ALWAYS_ENQ_IMMED\n"
" -F COUNT IMMED stress: force every COUNT'th enqueue to a busy local DSQ (use with -I)\n"
+" -C MODE cid-override test (shuffle|bad-dup|bad-range)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
@@ -60,23 +66,36 @@ int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
+ struct qmap_arena *qa;
+ __u32 test_error_cnt = 0;
+ __u64 ecode;
int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
+ if (libbpf_num_possible_cpus() > SCX_QMAP_MAX_CPUS) {
+ fprintf(stderr,
+ "scx_qmap: %d possible CPUs exceeds compile-time cap %d; "
+ "rebuild with larger SCX_QMAP_MAX_CPUS\n",
+ libbpf_num_possible_cpus(), SCX_QMAP_MAX_CPUS);
+ return 1;
+ }
+restart:
+ optind = 1;
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ skel->rodata->max_tasks = 16384;
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:C:vh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
- skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -90,6 +109,9 @@ int main(int argc, char **argv)
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
+ case 'N':
+ skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+ break;
case 'P':
skel->rodata->print_dsqs_and_events = true;
break;
@@ -130,6 +152,35 @@ int main(int argc, char **argv)
case 'F':
skel->rodata->immed_stress_nth = strtoul(optarg, NULL, 0);
break;
+ case 'C': {
+ u32 nr_cpus = libbpf_num_possible_cpus();
+ u32 mode, i;
+
+ if (!strcmp(optarg, "shuffle"))
+ mode = 1;
+ else if (!strcmp(optarg, "bad-dup"))
+ mode = 2;
+ else if (!strcmp(optarg, "bad-range"))
+ mode = 3;
+ else {
+ fprintf(stderr, "unknown cid-override mode '%s'\n", optarg);
+ return 1;
+ }
+ skel->rodata->cid_override_mode = mode;
+
+ /* shuffle: reversed cpu_to_cid; bad-dup, bad-range: identity, then one bad entry set below */
+ for (i = 0; i < nr_cpus; i++) {
+ if (mode == 1)
+ skel->bss->cid_override_cpu_to_cid[i] = nr_cpus - 1 - i;
+ else
+ skel->bss->cid_override_cpu_to_cid[i] = i;
+ }
+ if (mode == 2 && nr_cpus >= 2)
+ skel->bss->cid_override_cpu_to_cid[1] = 0;
+ if (mode == 3)
+ skel->bss->cid_override_cpu_to_cid[0] = (s32)nr_cpus;
+ break;
+ }
case 'v':
verbose = true;
break;
@@ -142,39 +193,41 @@ int main(int argc, char **argv)
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+ qa = &skel->arena->qa;
+ qa->test_error_cnt = test_error_cnt;
+
while (!exit_req && !UEI_EXITED(skel, uei)) {
- long nr_enqueued = skel->bss->nr_enqueued;
- long nr_dispatched = skel->bss->nr_dispatched;
+ long nr_enqueued = qa->nr_enqueued;
+ long nr_dispatched = qa->nr_dispatched;
- printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cid0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
- skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
- skel->bss->nr_dequeued,
- skel->bss->nr_core_sched_execed,
- skel->bss->nr_ddsp_from_enq);
- printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
- skel->bss->nr_expedited_local,
- skel->bss->nr_expedited_remote,
- skel->bss->nr_expedited_from_timer,
- skel->bss->nr_expedited_lost);
- if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ qa->nr_reenqueued, qa->nr_reenqueued_cid0,
+ qa->nr_dequeued,
+ qa->nr_core_sched_execed,
+ qa->nr_ddsp_from_enq);
+ printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+ qa->nr_expedited_local,
+ qa->nr_expedited_remote,
+ qa->nr_expedited_from_timer,
+ qa->nr_expedited_lost);
+ if (__COMPAT_has_ksym("scx_bpf_cidperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
- skel->bss->cpuperf_min,
- skel->bss->cpuperf_avg,
- skel->bss->cpuperf_max,
- skel->bss->cpuperf_target_min,
- skel->bss->cpuperf_target_avg,
- skel->bss->cpuperf_target_max);
+ qa->cpuperf_min,
+ qa->cpuperf_avg,
+ qa->cpuperf_max,
+ qa->cpuperf_target_min,
+ qa->cpuperf_target_avg,
+ qa->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
- UEI_REPORT(skel, uei);
+ ecode = UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
- /*
- * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
- * on CPU hotplug events.
- */
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
return 0;
}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 0000000000000..d15a705d5ac52
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS 8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS 1024
+
+struct cpu_ctx { /* per-cid state; stored in qmap_arena.cpu_ctxs[] */
+ __u64 dsp_idx; /* dispatch index */
+ __u64 dsp_cnt; /* remaining count */
+ __u32 avg_weight;
+ __u32 cpuperf_target; /* aggregated into qmap_arena.cpuperf_target_{min,avg,max} */
+};
+
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
+struct qmap_fifo { /* one task queue; qmap_arena holds five of these */
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx; /* slot in qmap_arena.fifos[]; assigned by qmap_init() */
+};
+
+struct qmap_arena { /* arena-resident scheduler state, also mapped by the loader */
+ /* userspace-visible stats */
+ __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cid0;
+ __u64 nr_dequeued, nr_ddsp_from_enq;
+ __u64 nr_core_sched_execed;
+ __u64 nr_expedited_local, nr_expedited_remote;
+ __u64 nr_expedited_lost, nr_expedited_from_timer;
+ __u64 nr_highpri_queued;
+ __u32 test_error_cnt; /* not a stat: written by the loader after attach (-e COUNT) */
+ __u32 cpuperf_min, cpuperf_avg, cpuperf_max; /* from scx_bpf_cidperf_cur()/cap() samples */
+ __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; /* from cpu_ctx.cpuperf_target */
+
+ /* kernel-side runtime state */
+ __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; /* 0 marks a free slot */
+ __u64 core_sched_head_seqs[5];
+ __u64 core_sched_tail_seqs[5];
+
+ struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS]; /* indexed by cid */
+
+ /* task_ctx slab; allocated and threaded by qmap_init() */
+ struct task_ctx __arena *task_ctxs;
+ struct task_ctx __arena *task_free_head; /* free list, chained via next_free */
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
+};
+
+#endif /* __SCX_QMAP_H */
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index 02e43c184d438..446d82807f904 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -27,18 +27,25 @@ def read_static_key(name):
def state_str(state):
return prog['scx_enable_state_str'][state].string_().decode()
+def read_root_ops_name():  # ops name of the root scheduler, '' when root is falsy
+ if root:
+ return root.ops.name.string_().decode()
+ return ''
+
+def read_root_field(name, default):  # value of root.<name>, or default when root is falsy
+ if root:
+ return getattr(root, name).value_()
+ return default
+
root = prog['scx_root']
enable_state = read_atomic("scx_enable_state_var")
-if root:
- print(f'ops : {root.ops.name.string_().decode()}')
-else:
- print('ops : ')
+print(f'ops : {read_root_ops_name()}')
print(f'enabled : {read_static_key("__scx_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {state_str(enable_state)} ({enable_state})')
-print(f'aborting : {prog["scx_aborting"].value_()}')
-print(f'bypass_depth : {prog["scx_bypass_depth"].value_()}')
+print(f'aborting : {read_root_field("aborting", False)}')
+print(f'bypass_depth : {read_root_field("bypass_depth", 0)}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
index 9f16d39255e73..0d6fcc8e5eb61 100644
--- a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
+++ b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
@@ -9,12 +9,7 @@
* Copyright (C) 2026 Cheng-Yang Chou <yphbchou0911@gmail.com>
*/
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-/* SCX kfunc from scx_kfunc_ids_any set */
-void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+#include <scx/common.bpf.h>
SEC("struct_ops/ssthresh")
__u32 BPF_PROG(tcp_ca_ssthresh, struct sock *sk)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
index 5b6e045e1109b..7e342c0cec653 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
@@ -6,6 +6,7 @@
*/
#include <bpf/bpf.h>
#include <scx/common.h>
+#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include "select_cpu_dfl.bpf.skel.h"
@@ -13,29 +14,44 @@
#define NUM_CHILDREN 1028
+struct select_cpu_dfl_ctx { /* test state handed from setup() to run() and cleanup() */
+ struct select_cpu_dfl *skel; /* opened and loaded in setup() */
+ struct bpf_link *link; /* NULL until run() attaches; destroyed in cleanup() */
+};
+
static enum scx_test_status setup(void **ctx)
{
- struct select_cpu_dfl *skel;
+ struct select_cpu_dfl_ctx *tctx;
+
+ tctx = malloc(sizeof(*tctx));
+ SCX_FAIL_IF(!tctx, "Failed to allocate test context");
+ tctx->link = NULL;
- skel = select_cpu_dfl__open();
- SCX_FAIL_IF(!skel, "Failed to open");
- SCX_ENUM_INIT(skel);
- SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel");
+ tctx->skel = select_cpu_dfl__open();
+ if (!tctx->skel) {
+ free(tctx);
+ SCX_FAIL("Failed to open");
+ }
+ SCX_ENUM_INIT(tctx->skel);
+ if (select_cpu_dfl__load(tctx->skel)) {
+ select_cpu_dfl__destroy(tctx->skel);
+ free(tctx);
+ SCX_FAIL("Failed to load skel");
+ }
- *ctx = skel;
+ *ctx = tctx;
return SCX_TEST_PASS;
}
static enum scx_test_status run(void *ctx)
{
- struct select_cpu_dfl *skel = ctx;
- struct bpf_link *link;
+ struct select_cpu_dfl_ctx *tctx = ctx;
pid_t pids[NUM_CHILDREN];
- int i, status;
+ int i, status, nforked = 0;
- link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
- SCX_FAIL_IF(!link, "Failed to attach scheduler");
+ tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops);
+ SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler");
for (i = 0; i < NUM_CHILDREN; i++) {
pids[i] = fork();
@@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
sleep(1);
exit(0);
}
+ if (pids[i] > 0)
+ nforked++;
}
for (i = 0; i < NUM_CHILDREN; i++) {
+ if (pids[i] <= 0)
+ continue;
SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
SCX_EQ(status, 0);
}
- SCX_ASSERT(!skel->bss->saw_local);
-
- bpf_link__destroy(link);
+ SCX_GT(nforked, 0);
+ SCX_ASSERT(!tctx->skel->bss->saw_local);
return SCX_TEST_PASS;
}
static void cleanup(void *ctx)
{
- struct select_cpu_dfl *skel = ctx;
+ struct select_cpu_dfl_ctx *tctx = ctx;
- select_cpu_dfl__destroy(skel);
+ if (tctx->link)
+ bpf_link__destroy(tctx->link);
+ select_cpu_dfl__destroy(tctx->skel);
+ free(tctx);
}
struct scx_test select_cpu_dfl = {