]> Gentwo Git Trees - linux/.git/commitdiff
Revert "pid: allow pid_max to be set per pid namespace"
authorMichal Koutný <mkoutny@suse.com>
Fri, 21 Feb 2025 17:02:48 +0000 (18:02 +0100)
committerAndrew Morton <akpm@linux-foundation.org>
Sun, 23 Feb 2025 06:26:20 +0000 (22:26 -0800)
Patch series "Alternative "pid_max" for 32-bit userspace".

pid_max is sort of a legacy limit (its value and partially the concept
too, given the existence of pids cgroup controller).  It is tempting to
make the pid_max value part of a pid namespace to provide compat
environment for 32-bit applications [1].  On the other hand, it provides
yet another mechanism for limitation of task count.  Even without
namespacing of pid_max value, the configuration of conscious limit is
confusing for users [2].

This series builds upon the idea of restricting the number (amount) of
tasks by pids controller and ensuring that number (pid) never exceeds the
amount of tasks.  This would not currently work out of the box because
next-fit pid allocation would continue to assign numbers (pids) higher
than the actual amount (there would be gaps in the lower range of the
interval).  The patch 2/2 implements this idea by extending semantics of
ns_last_pid knob to allow first-fit numbering.  (The implementation has
clumsy ifdefery, which can might be dropped since it's too x86-centric.)

The patch 1/2 is a mere revert to simplify pid_max to one global limit
only.

[1] https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
[2] https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/

This patch (of 2):

This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80.

It is already difficult for users to troubleshoot which of multiple pid
limits restricts their workload.  I'm afraid making pid_max
per-(hierarchical-)NS will contribute to confusion.  Also, the
implementation copies the limit upon creation from parent, this pattern
showed cumbersome with some attributes in legacy cgroup controllers --
it's subject to race condition between parent's limit modification and
children creation and once copied it must be changed in the descendant.

This is very similar to what pids.max of a cgroup (already) does that can
be used as an alternative.

Link: https://lkml.kernel.org/r/20250221170249.890014-1-mkoutny@suse.com
Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/
Link: https://lkml.kernel.org/r/20250221170249.890014-2-mkoutny@suse.com
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Cc: Alexander Mikhalitsyn <alexander@mihalicyn.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <kees@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/pid.h
include/linux/pid_namespace.h
kernel/pid.c
kernel/pid_namespace.c
kernel/sysctl.c
kernel/trace/pid_list.c
kernel/trace/trace.h
kernel/trace/trace_sched_switch.c

index 98837a1ff0f334556b2cb1cb6be639056ffa9029..fe575fcdb4afa481645b9ce80fb6ee7147320cd2 100644 (file)
@@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);
 
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
  * or rcu_read_lock() held.
index 7c67a58111998d9bcb1ba29c017acb249286ec38..f9f9931e02d6ad51970be627edcc58fbda879f0f 100644 (file)
@@ -30,7 +30,6 @@ struct pid_namespace {
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        unsigned int level;
-       int pid_max;
        struct pid_namespace *parent;
 #ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;
@@ -39,14 +38,9 @@ struct pid_namespace {
        struct ucounts *ucounts;
        int reboot;     /* group exit code if this pidns was rebooted */
        struct ns_common ns;
-       struct work_struct      work;
-#ifdef CONFIG_SYSCTL
-       struct ctl_table_set    set;
-       struct ctl_table_header *sysctls;
-#if defined(CONFIG_MEMFD_CREATE)
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        int memfd_noexec_scope;
 #endif
-#endif
 } __randomize_layout;
 
 extern struct pid_namespace init_pid_ns;
@@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
 void pid_idr_init(void);
-int register_pidns_sysctls(struct pid_namespace *pidns);
-void unregister_pidns_sysctls(struct pid_namespace *pidns);
 
 static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
 {
index 924084713be8ba9883b379baa346703fc1c91aff..aa2a7d4da4555dadcf579c53e2cd0d37ad81e55d 100644 (file)
@@ -61,8 +61,10 @@ struct pid init_struct_pid = {
        }, }
 };
 
-static int pid_max_min = RESERVED_PIDS + 1;
-static int pid_max_max = PID_MAX_LIMIT;
+int pid_max = PID_MAX_DEFAULT;
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = {
 #ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
 #endif
-       .pid_max = PID_MAX_DEFAULT,
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
@@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;
-               int pid_max = READ_ONCE(tmp->pid_max);
 
                if (set_tid_size) {
                        tid = set_tid[ns->level - i];
@@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
        return fd;
 }
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
-{
-       return &task_active_pid_ns(current)->set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-       return &task_active_pid_ns(current)->set == set;
-}
-
-static int pid_table_root_permissions(struct ctl_table_header *head,
-                                     const struct ctl_table *table)
-{
-       struct pid_namespace *pidns =
-               container_of(head->set, struct pid_namespace, set);
-       int mode = table->mode;
-
-       if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
-           uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
-               mode = (mode & S_IRWXU) >> 6;
-       else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
-               mode = (mode & S_IRWXG) >> 3;
-       else
-               mode = mode & S_IROTH;
-       return (mode << 6) | (mode << 3) | mode;
-}
-
-static void pid_table_root_set_ownership(struct ctl_table_header *head,
-                                        kuid_t *uid, kgid_t *gid)
-{
-       struct pid_namespace *pidns =
-               container_of(head->set, struct pid_namespace, set);
-       kuid_t ns_root_uid;
-       kgid_t ns_root_gid;
-
-       ns_root_uid = make_kuid(pidns->user_ns, 0);
-       if (uid_valid(ns_root_uid))
-               *uid = ns_root_uid;
-
-       ns_root_gid = make_kgid(pidns->user_ns, 0);
-       if (gid_valid(ns_root_gid))
-               *gid = ns_root_gid;
-}
-
-static struct ctl_table_root pid_table_root = {
-       .lookup         = pid_table_root_lookup,
-       .permissions    = pid_table_root_permissions,
-       .set_ownership  = pid_table_root_set_ownership,
-};
-
-static const struct ctl_table pid_table[] = {
-       {
-               .procname       = "pid_max",
-               .data           = &init_pid_ns.pid_max,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &pid_max_min,
-               .extra2         = &pid_max_max,
-       },
-};
-#endif
-
-int register_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-       struct ctl_table *tbl;
-
-       setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
-
-       tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
-       if (!tbl)
-               return -ENOMEM;
-       tbl->data = &pidns->pid_max;
-       pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
-                            PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
-
-       pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
-                                                ARRAY_SIZE(pid_table));
-       if (!pidns->sysctls) {
-               kfree(tbl);
-               retire_sysctl_set(&pidns->set);
-               return -ENOMEM;
-       }
-#endif
-       return 0;
-}
-
-void unregister_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-       const struct ctl_table *tbl;
-
-       tbl = pidns->sysctls->ctl_table_arg;
-       unregister_sysctl_table(pidns->sysctls);
-       retire_sysctl_set(&pidns->set);
-       kfree(tbl);
-#endif
-}
-
 void __init pid_idr_init(void)
 {
        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
        /* bump default and minimum pid_max based on number of cpus */
-       init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
-                                 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+       pid_max = min(pid_max_max, max_t(int, pid_max,
+                               PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
        pid_max_min = max_t(int, pid_max_min,
                                PIDS_PER_CPU_MIN * num_possible_cpus());
-       pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+       pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
        idr_init(&init_pid_ns.idr);
 
@@ -766,16 +665,6 @@ void __init pid_idr_init(void)
                        NULL);
 }
 
-static __init int pid_namespace_sysctl_init(void)
-{
-#ifdef CONFIG_SYSCTL
-       /* "kernel" directory will have already been initialized. */
-       BUG_ON(register_pidns_sysctls(&init_pid_ns));
-#endif
-       return 0;
-}
-subsys_initcall(pid_namespace_sysctl_init);
-
 static struct file *__pidfd_fget(struct task_struct *task, int fd)
 {
        struct file *file;
index 8f6cfec87555a3af47566b19d853306371106690..0f23285be4f921ea6a59bd052cae2b4daf887872 100644 (file)
@@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
        dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work);
-
 static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
        struct pid_namespace *parent_pid_ns)
 {
@@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
                goto out_free_idr;
        ns->ns.ops = &pidns_operations;
 
-       ns->pid_max = parent_pid_ns->pid_max;
-       err = register_pidns_sysctls(ns);
-       if (err)
-               goto out_free_inum;
-
        refcount_set(&ns->ns.count, 1);
        ns->level = level;
        ns->parent = get_pid_ns(parent_pid_ns);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = ucounts;
        ns->pid_allocated = PIDNS_ADDING;
-       INIT_WORK(&ns->work, destroy_pid_namespace_work);
-
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
 #endif
-
        return ns;
 
-out_free_inum:
-       ns_free_inum(&ns->ns);
 out_free_idr:
        idr_destroy(&ns->idr);
        kmem_cache_free(pid_ns_cachep, ns);
@@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-       unregister_pidns_sysctls(ns);
-
        ns_free_inum(&ns->ns);
 
        idr_destroy(&ns->idr);
        call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work)
-{
-       struct pid_namespace *ns =
-               container_of(work, struct pid_namespace, work);
-
-       do {
-               struct pid_namespace *parent;
-
-               parent = ns->parent;
-               destroy_pid_namespace(ns);
-               ns = parent;
-       } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
-}
-
 struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
@@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
 
 void put_pid_ns(struct pid_namespace *ns)
 {
-       if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
-               schedule_work(&ns->work);
+       struct pid_namespace *parent;
+
+       while (ns != &init_pid_ns) {
+               parent = ns->parent;
+               if (!refcount_dec_and_test(&ns->ns.count))
+                       break;
+               destroy_pid_namespace(ns);
+               ns = parent;
+       }
 }
 EXPORT_SYMBOL_GPL(put_pid_ns);
 
@@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
        next = idr_get_cursor(&pid_ns->idr) - 1;
 
        tmp.data = &next;
-       tmp.extra2 = &pid_ns->pid_max;
        ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
        if (!ret && write)
                idr_set_cursor(&pid_ns->idr, next + 1);
@@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
        return ret;
 }
 
+extern int pid_max;
 static const struct ctl_table pid_ns_ctl_table[] = {
        {
                .procname = "ns_last_pid",
@@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = {
                .mode = 0666, /* permissions are checked in the handler */
                .proc_handler = pid_ns_ctl_handler,
                .extra1 = SYSCTL_ZERO,
-               .extra2 = &init_pid_ns.pid_max,
+               .extra2 = &pid_max,
        },
 };
 #endif /* CONFIG_CHECKPOINT_RESTORE */
index cb57da499ebb1216cefb3705694ab62028fee03e..bb739608680f2ff09f5e79355c2d653cbf89f3bb 100644 (file)
@@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
 #endif
+       {
+               .procname       = "pid_max",
+               .data           = &pid_max,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &pid_max_min,
+               .extra2         = &pid_max_max,
+       },
        {
                .procname       = "panic_on_oops",
                .data           = &panic_on_oops,
index c62b9b3cfb3d8ebb6b15d1060a2e70f54598c4ed..4966e6bbdf6f3786dede22abbcf9ac046b594139 100644 (file)
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
        int i;
 
        /* According to linux/thread.h, pids can be no bigger that 30 bits */
-       WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
+       WARN_ON_ONCE(pid_max > (1 << 30));
 
        pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
        if (!pid_list)
index 9c21ba45b7af6bdbe6dce2732e38c929e54834cc..46c65402ad7e505caec5d64804d7943ff4d4c931 100644 (file)
@@ -732,6 +732,8 @@ extern unsigned long tracing_thresh;
 
 /* PID filtering */
 
+extern int pid_max;
+
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
                             pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
index cb49f7279dc8043e6543fa71303c63d5d7544158..573b5d8e8a28e56603919eb2baa63d32a775af71 100644 (file)
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
        if (tgid_map)
                return 0;
 
-       tgid_map_max = init_pid_ns.pid_max;
+       tgid_map_max = pid_max;
        map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
                       GFP_KERNEL);
        if (!map)