sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
author Tejun Heo <tj@kernel.org>
Tue, 11 Nov 2025 19:18:06 +0000 (09:18 -1000)
committer Tejun Heo <tj@kernel.org>
Wed, 12 Nov 2025 16:43:44 +0000 (06:43 -1000)
Bypass mode routes tasks through fallback dispatch queues (DSQs). Originally
these all fed a single global DSQ; commit b7b3b2dbae73 ("sched_ext: Split the
global DSQ per NUMA node") changed this to per-node DSQs to resolve
NUMA-related livelocks.

Dan Schatzberg found that per-node DSQs can still livelock when many threads
are pinned to different small CPU subsets: each CPU must scan past many
incompatible tasks to find one it can run, causing severe contention on the
shared queues as CPU counts grow.

Switch to per-CPU bypass DSQs: each task is queued on its current CPU's bypass
DSQ. The default idle CPU selection and direct dispatch spread tasks across
CPUs well enough in most cases.

This introduces a new failure mode on over-saturated systems where tasks can
concentrate on one CPU. If the BPF scheduler severely skews placement before
bypass is triggered, that CPU's bypass queue may grow too long to drain in
time, causing RCU stalls. A load balancer added in a future patch will address
this. The bypass DSQ is kept separate from the local DSQ precisely to make
such balancing possible: local DSQs are protected by rq locks, which prevent
efficient scanning and transfer across CPUs, and taking them is especially
problematic on systems that are already contended.
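To show why the separate per-DSQ lock matters for the planned balancing, here
is a toy userspace sketch under assumed names (struct toy_dsq and
balance_bypass() are hypothetical; the real balancer lands in a later patch).
Because each bypass DSQ carries its own lock, one CPU can lock a busy and an
idle queue pairwise and shift work between them without touching any rq lock:

    #include <pthread.h>

    struct toy_dsq {
            pthread_mutex_t lock;           /* stands in for the DSQ's spinlock */
            int             nr_queued;
    };

    static void balance_bypass(struct toy_dsq *busy, struct toy_dsq *idle)
    {
            /* fixed lock order by address avoids ABBA deadlock */
            struct toy_dsq *a = busy < idle ? busy : idle;
            struct toy_dsq *b = busy < idle ? idle : busy;

            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
            while (busy->nr_queued > idle->nr_queued + 1) {
                    busy->nr_queued--;      /* detach a task here ... */
                    idle->nr_queued++;      /* ... and queue it over there */
            }
            pthread_mutex_unlock(&b->lock);
            pthread_mutex_unlock(&a->lock);
    }

With local DSQs, the same transfer would need to hold both CPUs' rq locks,
which is exactly what is too expensive on an already contended system.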

v2: Clarified why the bypass DSQ is separate from the local DSQ (Andrea Righi).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
include/linux/sched/ext.h
kernel/sched/ext.c
kernel/sched/sched.h

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 60285c3d07cf61c6ac241a6118f4983af02c4494..3d3216ff91887b661f4abecb8c20241b6f19b299 100644
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
        SCX_DSQ_INVALID         = SCX_DSQ_FLAG_BUILTIN | 0,
        SCX_DSQ_GLOBAL          = SCX_DSQ_FLAG_BUILTIN | 1,
        SCX_DSQ_LOCAL           = SCX_DSQ_FLAG_BUILTIN | 2,
+       SCX_DSQ_BYPASS          = SCX_DSQ_FLAG_BUILTIN | 3,
        SCX_DSQ_LOCAL_ON        = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
        SCX_DSQ_LOCAL_CPU_MASK  = 0xffffffffLLU,
 };
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43083602c15e38be06439386faa09f50367b2e79..747391a3f6e393c4d8f9f130b833369164aa5c00 100644
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 
        if (scx_rq_bypassing(rq)) {
                __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
-               goto global;
+               goto bypass;
        }
 
        if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 global:
        dsq = find_global_dsq(sch, p);
        goto enqueue;
+bypass:
+       dsq = &task_rq(p)->scx.bypass_dsq;
+       goto enqueue;
 
 enqueue:
        /*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
        if (consume_global_dsq(sch, rq))
                goto has_tasks;
 
-       if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
-           scx_rq_bypassing(rq) || !scx_rq_online(rq))
+       if (scx_rq_bypassing(rq)) {
+               if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+                       goto has_tasks;
+               else
+                       goto no_tasks;
+       }
+
+       if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
                goto no_tasks;
 
        dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
                int n = cpu_to_node(cpu);
 
                init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+               init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
                INIT_LIST_HEAD(&rq->scx.runnable_list);
                INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 27aae2a298f8b422b7611f428254e85709bedab4..5991133a484986a967e668658d2f66943ae5a806 100644
@@ -808,6 +808,7 @@ struct scx_rq {
        struct balance_callback deferred_bal_cb;
        struct irq_work         deferred_irq_work;
        struct irq_work         kick_cpus_irq_work;
+       struct scx_dispatch_q   bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */