cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
author     Tejun Heo <tj@kernel.org>
           Thu, 6 Nov 2025 18:12:36 +0000 (08:12 -1000)
committer  Tejun Heo <tj@kernel.org>
           Thu, 6 Nov 2025 22:52:26 +0000 (12:52 -1000)
cgroup_task_dead() is called from finish_task_switch(), which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock, a regular spinlock that becomes a
sleeping lock on RT kernels, leading to "sleeping function called from
invalid context" warnings.
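Not part of the patch, but to make the failure mode concrete: the pattern that
trips the warning is taking a spinlock_t, which PREEMPT_RT turns into a
sleeping rtmutex-based lock, while preemption is disabled. A minimal sketch
with hypothetical names, assuming CONFIG_DEBUG_ATOMIC_SLEEP is enabled:

  #include <linux/preempt.h>
  #include <linux/spinlock.h>

  /* spinlock_t becomes a sleeping lock on PREEMPT_RT */
  static DEFINE_SPINLOCK(demo_lock);

  static void demo_invalid_context(void)
  {
          preempt_disable();      /* non-preemptible, as in finish_task_switch() */
          spin_lock(&demo_lock);  /* RT: "sleeping function called from invalid context" */
          spin_unlock(&demo_lock);
          preempt_enable();
  }

css_set_lock is exactly such a spinlock_t, and finish_task_switch() is exactly
such a non-preemptible context.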

css_set_lock is too large in scope to convert to a raw_spinlock. However,
the unlinking operations don't need to run synchronously - they just need
to complete after the task is done running.

On PREEMPT_RT, defer the work through irq_work. While the work doesn't need
to happen immediately, it can't be delayed indefinitely either, as that would
leave the dead task's cgroup and task_struct pinned indefinitely. Use the
lazy version of irq_work to allow batching and lower the impact while
ensuring timely completion.

v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation
    for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior).
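Not part of the patch, but for background on the v2 choice: a lazy irq_work is
normally not raised via a self-IPI when queued; it is batched and run from the
next timer tick (on PREEMPT_RT, non-hardirq items are handled by the per-CPU
irq_work kthread), which keeps the per-exit overhead low while still bounding
the delay. A minimal declaration sketch with hypothetical names:

  #include <linux/irq_work.h>

  static void demo_fn(struct irq_work *work)
  {
          /* deferred work runs here, later, on the queueing CPU */
  }

  /* immediate variant: typically raises the irq_work self-IPI when queued */
  static struct irq_work demo_now = IRQ_WORK_INIT(demo_fn);

  /* lazy variant: batched, normally run from the next timer tick */
  static struct irq_work demo_lazy = IRQ_WORK_INIT_LAZY(demo_fn);

  static void demo_queue(void)
  {
          irq_work_queue(&demo_lazy);     /* same queueing call for both */
  }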

Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
include/linux/sched.h
kernel/cgroup/cgroup.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866fba8ba64305e44a535c6adb54c69..5e80d48488ef24f2d40c43fb1c6efaaf63768c7a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
        struct css_set __rcu            *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+       struct llist_node               cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
        u32                             closid;
        u32                             rmid;
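Not part of the patch, but for context on the new field: cg_dead_lnode is an
embedded node for the kernel's lock-less llist, which lets the scheduler path
push dead tasks without taking any lock and lets the irq_work callback in
cgroup.c below drain them all at once. A minimal sketch of that
producer/consumer pattern, with hypothetical names:

  #include <linux/llist.h>
  #include <linux/slab.h>

  struct demo_item {
          int payload;
          struct llist_node lnode;        /* embedded node, like cg_dead_lnode */
  };

  static LLIST_HEAD(demo_list);

  static void demo_push(struct demo_item *item)
  {
          /* lock-less push onto the list head */
          llist_add(&item->lnode, &demo_list);
  }

  static void demo_drain(void)
  {
          struct demo_item *item, *next;
          struct llist_node *first = llist_del_all(&demo_list);

          /* _safe variant because each entry is freed while walking */
          llist_for_each_entry_safe(item, next, first, lnode)
                  kfree(item);
  }

In the patch, the producer is cgroup_task_dead() and the consumer is the lazy
irq_work callback, which also drops the task reference taken when queueing.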
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index aae180d56c8c2df9627039ee8ee3202d02399db3..48019a661c080b9ccad8847341a4f66dbda05e18 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS    noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
        BUG_ON(ss_rstat_init(NULL));
 
        get_user_ns(init_cgroup_ns.user_ns);
+       cgroup_rt_init();
 
        cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
        } while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
        struct css_set *cset;
        unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
        spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even on RT. As the task_dead path needs to grab css_set_lock,
+ * this leads to "sleeping function called from invalid context" warnings.
+ * css_set_lock is too large in scope to become a raw_spinlock. The task_dead
+ * path doesn't need to run synchronously but can't be delayed indefinitely
+ * either, as that would leave the dead task's cgroup and task_struct pinned.
+ * Bounce through lazy irq_work to allow batching and timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+       struct llist_node *lnode;
+       struct task_struct *task, *next;
+
+       lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+       llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+               do_cgroup_task_dead(task);
+               put_task_struct(task);
+       }
+}
+
+static void __init cgroup_rt_init(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+               per_cpu(cgrp_dead_tasks_iwork, cpu) =
+                       IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+       }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+       get_task_struct(task);
+       llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+       irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else  /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+       do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
        struct cgroup_subsys *ss;