DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
*/
static u64 scx_slice_dfl = SCX_SLICE_DFL;
static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
static int set_slice_us(const char *val, const struct kernel_param *kp)
{
.get = param_get_uint,
};
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+ .set = set_bypass_lb_intv_us,
+ .get = param_get_uint,
+};
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "sched_ext."
module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 to disable, max 10s)");
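+/*
+ * Both knobs are mode 0600 and thus adjustable by root at runtime. With
+ * sched_ext built in, they should appear under
+ * /sys/module/sched_ext/parameters/ and as sched_ext.* boot parameters.
+ * A bypass_lb_intv_us of 0 keeps the bypass LB timer from being [re]armed.
+ */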
#undef MODULE_PARAM_PREFIX
!RB_EMPTY_NODE(&p->scx.dsq_priq));
if (!is_local) {
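+		/*
+		 * SCX_ENQ_NESTED indicates that the caller (the bypass mode
+		 * load balancer) already holds another non-local DSQ lock, the
+		 * donor bypass DSQ. Annotate the nesting for lockdep.
+		 */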
- raw_spin_lock(&dsq->lock);
+ raw_spin_lock_nested(&dsq->lock,
+ (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
return true;
}
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+ struct cpumask *donee_mask, struct cpumask *resched_mask,
+ u32 nr_donor_target, u32 nr_donee_target)
+{
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct task_struct *p, *n;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+ u32 nr_balanced = 0, min_delta_us;
+
+ /*
+ * All we want to guarantee is reasonable forward progress. No reason to
+	 * fine-tune. Assuming every task on $donor_dsq runs its full slice,
+ * consider offloading iff the total queued duration is over the
+ * threshold.
+ */
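+	/*
+	 * For example, assuming hypothetical values of a 100ms LB interval,
+	 * a min-delta divisor of 4 and a 5ms bypass slice, min_delta_us is
+	 * 25000 and offloading is considered only once $donor_dsq holds at
+	 * least DIV_ROUND_UP(25000, 5000) == 5 tasks above $nr_donor_target.
+	 */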
+ min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ return 0;
+
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ list_add(&cursor.node, &donor_dsq->list);
+resume:
+ n = container_of(&cursor, struct task_struct, scx.dsq_list);
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ while ((p = n)) {
+ struct rq *donee_rq;
+ struct scx_dispatch_q *donee_dsq;
+ int donee;
+
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ if (donor_dsq->nr <= nr_donor_target)
+ break;
+
+ if (cpumask_empty(donee_mask))
+ break;
+
+ donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+ if (donee >= nr_cpu_ids)
+ continue;
+
+ donee_rq = cpu_rq(donee);
+ donee_dsq = &donee_rq->scx.bypass_dsq;
+
+ /*
+ * $p's rq is not locked but $p's DSQ lock protects its
+	 * scheduling properties, making this test safe.
+ */
+ if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ continue;
+
+ /*
+ * Moving $p from one non-local DSQ to another. The source rq
+ * and DSQ are already locked. Do an abbreviated dequeue and
+ * then perform enqueue without unlocking $donor_dsq.
+ *
+ * We don't want to drop and reacquire the lock on each
+	 * iteration as $donor_dsq can be very long and potentially
+ * highly contended. Donee DSQs are less likely to be contended.
+ * The nested locking is safe as only this LB moves tasks
+ * between bypass DSQs.
+ */
+ dispatch_dequeue_locked(p, donor_dsq);
+ dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+ /*
+ * $donee might have been idle and need to be woken up. No need
+ * to be clever. Kick every CPU that receives tasks.
+ */
+ cpumask_set_cpu(donee, resched_mask);
+
+ if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+ cpumask_clear_cpu(donee, donee_mask);
+
+ nr_balanced++;
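+		/*
+		 * Every SCX_BYPASS_LB_BATCH moves, park $cursor in front of
+		 * the next task and drop the locks briefly to bound
+		 * IRQ-disabled and lock hold times, then resume from $cursor.
+		 */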
+ if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+ list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+ cpu_relax();
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ goto resume;
+ }
+ }
+
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+
+ return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+ const struct cpumask *node_mask = cpumask_of_node(node);
+ struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+ u32 nr_target, nr_donor_target;
+ u32 before_min = U32_MAX, before_max = 0;
+ u32 after_min = U32_MAX, after_max = 0;
+ int cpu;
+
+ /* count the target tasks and CPUs */
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ nr_tasks += nr;
+ nr_cpus++;
+
+ before_min = min(nr, before_min);
+ before_max = max(nr, before_max);
+ }
+
+ if (!nr_cpus)
+ return;
+
+ /*
+	 * We don't want CPUs to carry more than $nr_donor_target tasks and we
+	 * want balancing to fill donee CPUs up to $nr_target. Once targets are
+ * calculated, find the donee CPUs.
+ */
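+	/*
+	 * For example, hypothetically balancing 50 tasks across 8 online CPUs
+	 * in the node gives $nr_target of DIV_ROUND_UP(50, 8) == 7 and, with a
+	 * donor percentage of 125, $nr_donor_target of
+	 * DIV_ROUND_UP(7 * 125, 100) == 9.
+	 */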
+ nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+ nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
+
+ cpumask_clear(donee_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ cpumask_set_cpu(cpu, donee_mask);
+ }
+
+ /* iterate !donee CPUs and see if they should be offloaded */
+ cpumask_clear(resched_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+ if (cpumask_empty(donee_mask))
+ break;
+ if (cpumask_test_cpu(cpu, donee_mask))
+ continue;
+ if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ continue;
+
+ nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_donor_target, nr_target);
+ }
+
+ for_each_cpu(cpu, resched_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ raw_spin_rq_lock_irq(rq);
+ resched_curr(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ after_min = min(nr, after_min);
+ after_max = max(nr, after_max);
+ }
+
+ trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+ before_min, before_max, after_min, after_max);
+}
+
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler has skewed tasks onto a few CPUs,
+ * some bypass DSQs can become overloaded. If there are enough tasks to
+ * saturate other lightly loaded CPUs, such an imbalance can lead to very high
+ * execution latency on the overloaded CPUs and thus to hung tasks and RCU
+ * stalls. To avoid such outcomes, a simple load balancing mechanism is
+ * implemented by the following timer, which runs periodically while bypass
+ * mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
+{
+ struct scx_sched *sch;
+ int node;
+ u32 intv_us;
+
+ sch = rcu_dereference_all(scx_root);
+ if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+ return;
+
+ for_each_node_with_cpus(node)
+ bypass_lb_node(sch, node);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us)
+ mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
+}
+
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
sch = rcu_dereference_bh(scx_root);
if (bypass) {
- scx_bypass_depth++;
+ u32 intv_us;
+
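+		/*
+		 * $scx_bypass_depth is read locklessly by the bypass LB timer.
+		 * Pair the updates with the READ_ONCE() in
+		 * scx_bypass_lb_timerfn().
+		 */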
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+ scx_bypass_lb_timer.expires =
+ jiffies + usecs_to_jiffies(intv_us);
+ add_timer_global(&scx_bypass_lb_timer);
+ }
} else {
- scx_bypass_depth--;
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
WARN_ON_ONCE(scx_bypass_depth < 0);
if (scx_bypass_depth != 0)
goto unlock;
return ret;
}
+ if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+ !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ pr_err("sched_ext: Failed to allocate cpumasks\n");
+ return -ENOMEM;
+ }
+
return 0;
}
__initcall(scx_init);