sched_ext: Fix SCX_KICK_WAIT to work reliably
author     Tejun Heo <tj@kernel.org>
           Wed, 22 Oct 2025 20:56:28 +0000 (10:56 -1000)
committer  Tejun Heo <tj@kernel.org>
           Wed, 22 Oct 2025 21:42:14 +0000 (11:42 -1000)
SCX_KICK_WAIT is used to synchronously wait for the target CPU to complete
a reschedule and can be used to implement operations like core scheduling.

This used to be implemented by scx_next_task_picked() incrementing pnt_seq.
Because scx_next_task_picked() was called every time a CPU picked its next
task to run, SCX_KICK_WAIT could reliably wait for the target CPU to enter
the scheduler and pick the next task.

However, commit b999e365c298 ("sched_ext: Replace scx_next_task_picked()
with switch_class()") replaced scx_next_task_picked() with the
switch_class() callback, which is only called when switching between sched
classes. This broke SCX_KICK_WAIT because pnt_seq would no longer be
reliably incremented unless the previous task was SCX and the next task was
not.

This fix builds on commit 4c95380701f5 ("sched/ext: Fold balance_scx() into
pick_task_scx()"), which refactored the pick path and made
put_prev_task_scx() the natural place to track task switches for
SCX_KICK_WAIT. The fix moves the pnt_seq increment to put_prev_task_scx()
and also increments it in pick_task_scx() to handle cases where the same
task is re-selected, whether by BPF scheduler decision or slice refill. The
resulting semantics: if the current task on the target CPU is an SCX task,
SCX_KICK_WAIT waits until that CPU enters the scheduling path. This is a
sufficient guarantee for use cases like core scheduling while keeping the
operation self-contained within SCX.
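
To illustrate the handshake, here is a minimal userspace analogue of the
pnt_seq protocol, with C11 atomics standing in for smp_store_release() /
smp_cond_load_acquire() and a thread standing in for the kicked CPU. All
names are illustrative; this is a sketch, not kernel code:

    /* cc -pthread pnt_seq_sketch.c */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <pthread.h>

    static atomic_ulong pnt_seq;    /* stands in for rq->scx.pnt_seq */

    /* Target-CPU side: bump the counter on every entry into the scheduling
     * path, mirroring the store_releases in pick_task_scx() and
     * put_prev_task_scx(). */
    static void *target_cpu(void *arg)
    {
            usleep(1000);   /* pretend the resched IPI takes a while to land */
            atomic_fetch_add_explicit(&pnt_seq, 1, memory_order_release);
            return arg;
    }

    int main(void)
    {
            pthread_t t;

            /* Kicker side: snapshot the counter, kick, then wait for it to
             * move, mirroring kick_one_cpu() and the wait loop in
             * kick_cpus_irq_workfn(). */
            unsigned long snap = atomic_load_explicit(&pnt_seq,
                                                      memory_order_relaxed);

            pthread_create(&t, NULL, target_cpu, NULL); /* "resched_curr()" */
            while (atomic_load_explicit(&pnt_seq, memory_order_acquire) == snap)
                    ;   /* smp_cond_load_acquire() in the kernel */
            printf("target entered the scheduling path: %lu -> %lu\n",
                   snap, atomic_load_explicit(&pnt_seq, memory_order_relaxed));
            pthread_join(t, NULL);
            return 0;
    }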

v2: - Also increment pnt_seq in pick_task_scx() to handle same-task
      re-selection (Andrea Righi).
    - Use smp_cond_load_acquire() for the busy-wait loop for better
      architecture optimization (Peter Zijlstra); a sketch of that
      primitive follows below.
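
For reference, the generic fallback for smp_cond_load_acquire() is simply a
polling loop with cpu_relax(); the point of the change is that architectures
can override it with something smarter, such as waiting on the cacheline.
A rough paraphrase in GNU C follows; the kernel's actual macro lives in
include/asm-generic/barrier.h, does relaxed loads followed by an acquire
barrier, and is arch-overridable:

    /* paraphrase only, slightly stronger per-iteration ordering than the
     * kernel's fallback */
    #define cond_load_acquire_sketch(ptr, cond_expr) ({                 \
            __typeof__(*(ptr)) VAL;                                     \
            for (;;) {                                                  \
                    VAL = __atomic_load_n((ptr), __ATOMIC_ACQUIRE);     \
                    if (cond_expr)                                      \
                            break;                                      \
                    /* cpu_relax() here in the kernel */                \
            }                                                           \
            VAL;                                                        \
    })

Usage matching the diff below would be
cond_load_acquire_sketch(wait_pnt_seq, VAL != pseqs[cpu]).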

Reported-by: Wen-Fang Liu <liuwenfang@honor.com>
Link: http://lkml.kernel.org/r/228ebd9e6ed3437996dffe15735a9caa@honor.com
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/sched/ext.c
kernel/sched/ext_internal.h

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 12e00f77a8487810e41e42174451c1ceda5d5825..1999f6e67022ae0938c5b57d6adeab431708377e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2260,12 +2260,6 @@ static void switch_class(struct rq *rq, struct task_struct *next)
        struct scx_sched *sch = scx_root;
        const struct sched_class *next_class = next->sched_class;
 
-       /*
-        * Pairs with the smp_load_acquire() issued by a CPU in
-        * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
-        * resched.
-        */
-       smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
        if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
                return;
 
@@ -2305,6 +2299,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
                              struct task_struct *next)
 {
        struct scx_sched *sch = scx_root;
+
+       /* see kick_cpus_irq_workfn() */
+       smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+
        update_curr_scx(rq);
 
        /* see dequeue_task_scx() on why we skip when !QUEUED */
@@ -2358,6 +2356,9 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
        bool keep_prev, kick_idle = false;
        struct task_struct *p;
 
+       /* see kick_cpus_irq_workfn() */
+       smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+
        rq_modified_clear(rq);
 
        rq_unpin_lock(rq, rf);
@@ -5144,8 +5145,12 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
                }
 
                if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
-                       pseqs[cpu] = rq->scx.pnt_seq;
-                       should_wait = true;
+                       if (cur_class == &ext_sched_class) {
+                               pseqs[cpu] = rq->scx.pnt_seq;
+                               should_wait = true;
+                       } else {
+                               cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+                       }
                }
 
                resched_curr(rq);
@@ -5206,18 +5211,19 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
        for_each_cpu(cpu, this_scx->cpus_to_wait) {
                unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
 
-               if (cpu != cpu_of(this_rq)) {
-                       /*
-                        * Pairs with smp_store_release() issued by this CPU in
-                        * switch_class() on the resched path.
-                        *
-                        * We busy-wait here to guarantee that no other task can
-                        * be scheduled on our core before the target CPU has
-                        * entered the resched path.
-                        */
-                       while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
-                               cpu_relax();
-               }
+               /*
+                * Busy-wait until the task running at the time of kicking is no
+                * longer running. This can be used to implement e.g. core
+                * scheduling.
+                *
+                * smp_cond_load_acquire() pairs with store_releases in
+                * pick_task_scx() and put_prev_task_scx(). The former breaks
+                * the wait if SCX's scheduling path is entered even if the same
+                * task is picked subsequently. The latter is necessary to break
+                * the wait when $cpu is taken by a higher sched class.
+                */
+               if (cpu != cpu_of(this_rq))
+                       smp_cond_load_acquire(wait_pnt_seq, VAL != pseqs[cpu]);
 
                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
        }
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 7d00a0a2456e27bbc51cdeeb696d246a7e09c07e..fb161fc35328cbcde007d36df5c7bf3cde947d73 100644
--- a/kernel/sched/ext_internal.h
--- +++ b/kernel/sched/ext_internal.h
@@ -997,8 +997,10 @@ enum scx_kick_flags {
        SCX_KICK_PREEMPT        = 1LLU << 1,
 
        /*
-        * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
-        * return after the target CPU finishes picking the next task.
+        * The scx_bpf_kick_cpu() call will return after the current SCX task of
+        * the target CPU switches out. This can be used to implement e.g. core
+        * scheduling. This has no effect if the current task on the target CPU
+        * is not on SCX.
         */
        SCX_KICK_WAIT           = 1LLU << 2,
 };
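
For context, a sketch of how a BPF scheduler might use the updated semantics,
e.g. to clear an SMT sibling before running a task on this core. Only
scx_bpf_kick_cpu() and the SCX_KICK_* flags are from the sched_ext API; the
callback name, smt_sibling_of(), and the cpu ^ 1 topology assumption are
illustrative:

    #include <scx/common.bpf.h>

    /* hypothetical helper; a real scheduler would consult actual topology */
    static s32 smt_sibling_of(s32 cpu)
    {
            return cpu ^ 1;
    }

    void BPF_STRUCT_OPS(coresched_dispatch, s32 cpu, struct task_struct *prev)
    {
            s32 sib = smt_sibling_of(cpu);

            /*
             * Returns once the sibling's current SCX task has entered the
             * scheduling path; if the sibling is running a non-SCX task,
             * no wait takes place (see kick_one_cpu() above).
             */
            scx_bpf_kick_cpu(sib, SCX_KICK_PREEMPT | SCX_KICK_WAIT);

            /* ... now dispatch a task with a compatible cookie here ... */
    }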