Merge branch 'slab/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka...
author Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 21 Jan 2025 06:16:13 +0000 (17:16 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 21 Jan 2025 06:16:14 +0000 (17:16 +1100)
init/main.c
kernel/rcu/tree.c
mm/slab_common.c

diff --cc init/main.c
Simple merge
diff --cc kernel/rcu/tree.c
Simple merge
diff --cc mm/slab_common.c
index a29457bef626feadabdd674ab97aa822b3487e9b,69f2d19010dedaa3e5b303ab9803c8cdd40152fa..4030907b6b7d89adcf62c70e82e95c8deb41adba
@@@ -1282,3 -1284,881 +1284,881 @@@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_all
  EXPORT_TRACEPOINT_SYMBOL(kfree);
  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
  
+ /*
+  * This RCU parameter is read-only at runtime. It sets the minimum
+  * number of objects that can be cached per CPU; each cached object
+  * is one page in size. The value can be changed only at boot time.
+  */
+ static int rcu_min_cached_objs = 5;
+ module_param(rcu_min_cached_objs, int, 0444);
+ // A page shrinker can ask for pages to be freed to make them
+ // available for other parts of the system. This usually happens
+ // under low memory conditions, and in that case we should also
+ // defer page-cache filling for a short time period.
+ //
+ // The default value is 5 seconds, which is long enough to reduce
+ // interference with the shrinker while it asks other systems to
+ // drain their caches.
+ static int rcu_delay_page_cache_fill_msec = 5000;
+ module_param(rcu_delay_page_cache_fill_msec, int, 0444);
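As a hedged aside (editorial, not part of the diff): both parameters are read-only module parameters that are set at boot time. While this code lived in kernel/rcu/tree.c the documented spellings used the rcutree. prefix on the kernel command line, for example:

        rcutree.rcu_min_cached_objs=10 rcutree.rcu_delay_page_cache_fill_msec=3000

After the move into mm/slab_common.c the prefix follows the defining file's module name, so the exact parameter names may differ.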
+ /* Maximum number of jiffies to wait before draining a batch. */
+ #define KFREE_DRAIN_JIFFIES (5 * HZ)
+ #define KFREE_N_BATCHES 2
+ #define FREE_N_CHANNELS 2
+ /**
+  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+  * @list: List node. All blocks are linked to one another
+  * @gp_snap: Snapshot of RCU state for objects placed in this block
+  * @nr_records: Number of active pointers in the array
+  * @records: Array of the kvfree_rcu() pointers
+  */
+ struct kvfree_rcu_bulk_data {
+       struct list_head list;
+       struct rcu_gp_oldstate gp_snap;
+       unsigned long nr_records;
+       void *records[] __counted_by(nr_records);
+ };
+ /*
+  * This macro defines how many entries the "records" array
+  * will contain. It is chosen so that the size of the
+  * kvfree_rcu_bulk_data structure is exactly one page.
+  */
+ #define KVFREE_BULK_MAX_ENTR \
+       ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
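A hedged worked example of the sizing above (editorial, not part of the diff; it assumes a 64-bit kernel with 4 KiB pages, and that struct rcu_gp_oldstate holds two unsigned longs; mirror_bulk_data is an illustrative stand-in):

        #include <stdio.h>

        /* Mirrors the layout of struct kvfree_rcu_bulk_data minus the
         * flexible records[] array, to show the KVFREE_BULK_MAX_ENTR math. */
        struct mirror_bulk_data {
                void *list_next, *list_prev;    /* struct list_head */
                unsigned long gp_snap[2];       /* struct rcu_gp_oldstate (assumed layout) */
                unsigned long nr_records;
        };

        int main(void)
        {
                unsigned long page_size = 4096;
                unsigned long entries =
                        (page_size - sizeof(struct mirror_bulk_data)) / sizeof(void *);

                /* 40-byte header -> (4096 - 40) / 8 = 507 pointers per one-page block. */
                printf("header = %zu bytes, entries = %lu\n",
                       sizeof(struct mirror_bulk_data), entries);
                return 0;
        }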
+ /**
+  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+  * @head_free: List of kfree_rcu() objects waiting for a grace period
+  * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+  * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+  * @krcp: Pointer to @kfree_rcu_cpu structure
+  */
+ struct kfree_rcu_cpu_work {
+       struct rcu_work rcu_work;
+       struct rcu_head *head_free;
+       struct rcu_gp_oldstate head_free_gp_snap;
+       struct list_head bulk_head_free[FREE_N_CHANNELS];
+       struct kfree_rcu_cpu *krcp;
+ };
+ /**
+  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+  * @head: List of kfree_rcu() objects not yet waiting for a grace period
+  * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+  * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+  * @lock: Synchronize access to this structure
+  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+  * @initialized: The @rcu_work fields have been initialized
+  * @head_count: Number of objects in rcu_head singular list
+  * @bulk_count: Number of objects in bulk-list
+  * @bkvcache:
+  *    A simple cache list that contains objects for reuse.
+  *    To save some per-CPU space the list is singly linked.
+  *    Even though it is lockless, access has to be protected by the
+  *    per-CPU lock.
+  * @page_cache_work: A work to refill the cache when it is empty
+  * @backoff_page_cache_fill: Delay cache refills
+  * @work_in_progress: Indicates that page_cache_work is running
+  * @hrtimer: A hrtimer for scheduling a page_cache_work
+  * @nr_bkv_objs: number of allocated objects at @bkvcache.
+  *
+  * This is a per-CPU structure.  The reason that it is not included in
+  * the rcu_data structure is to permit this code to be extracted from
+  * the RCU files.  Such extraction could allow further optimization of
+  * the interactions with the slab allocators.
+  */
+ struct kfree_rcu_cpu {
+       // Objects queued on a linked list
+       // through their rcu_head structures.
+       struct rcu_head *head;
+       unsigned long head_gp_snap;
+       atomic_t head_count;
+       // Objects queued on a bulk-list.
+       struct list_head bulk_head[FREE_N_CHANNELS];
+       atomic_t bulk_count[FREE_N_CHANNELS];
+       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+       raw_spinlock_t lock;
+       struct delayed_work monitor_work;
+       bool initialized;
+       struct delayed_work page_cache_work;
+       atomic_t backoff_page_cache_fill;
+       atomic_t work_in_progress;
+       struct hrtimer hrtimer;
+       struct llist_head bkvcache;
+       int nr_bkv_objs;
+ };
+ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+ };
+ static __always_inline void
+ debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
+ {
+ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+       int i;
+       for (i = 0; i < bhead->nr_records; i++)
+               debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
+ #endif
+ }
+ static inline struct kfree_rcu_cpu *
+ krc_this_cpu_lock(unsigned long *flags)
+ {
+       struct kfree_rcu_cpu *krcp;
+       local_irq_save(*flags); // For safely calling this_cpu_ptr().
+       krcp = this_cpu_ptr(&krc);
+       raw_spin_lock(&krcp->lock);
+       return krcp;
+ }
+ static inline void
+ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+ {
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ }
+ static inline struct kvfree_rcu_bulk_data *
+ get_cached_bnode(struct kfree_rcu_cpu *krcp)
+ {
+       if (!krcp->nr_bkv_objs)
+               return NULL;
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+       return (struct kvfree_rcu_bulk_data *)
+               llist_del_first(&krcp->bkvcache);
+ }
+ static inline bool
+ put_cached_bnode(struct kfree_rcu_cpu *krcp,
+       struct kvfree_rcu_bulk_data *bnode)
+ {
+       // Check the limit.
+       if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+               return false;
+       llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+       return true;
+ }
+ static int
+ drain_page_cache(struct kfree_rcu_cpu *krcp)
+ {
+       unsigned long flags;
+       struct llist_node *page_list, *pos, *n;
+       int freed = 0;
+       if (!rcu_min_cached_objs)
+               return 0;
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       page_list = llist_del_all(&krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, 0);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       llist_for_each_safe(pos, n, page_list) {
+               free_page((unsigned long)pos);
+               freed++;
+       }
+       return freed;
+ }
+ static void
+ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+       struct kvfree_rcu_bulk_data *bnode, int idx)
+ {
+       unsigned long flags;
+       int i;
+       if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+               debug_rcu_bhead_unqueue(bnode);
+               rcu_lock_acquire(&rcu_callback_map);
+               if (idx == 0) { // kmalloc() / kfree().
+                       trace_rcu_invoke_kfree_bulk_callback(
+                               "slab", bnode->nr_records,
+                               bnode->records);
+                       kfree_bulk(bnode->nr_records, bnode->records);
+               } else { // vmalloc() / vfree().
+                       for (i = 0; i < bnode->nr_records; i++) {
+                               trace_rcu_invoke_kvfree_callback(
+                                       "slab", bnode->records[i], 0);
+                               vfree(bnode->records[i]);
+                       }
+               }
+               rcu_lock_release(&rcu_callback_map);
+       }
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       if (put_cached_bnode(krcp, bnode))
+               bnode = NULL;
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       if (bnode)
+               free_page((unsigned long) bnode);
+       cond_resched_tasks_rcu_qs();
+ }
+ static void
+ kvfree_rcu_list(struct rcu_head *head)
+ {
+       struct rcu_head *next;
+       for (; head; head = next) {
+               void *ptr = (void *) head->func;
+               unsigned long offset = (void *) head - ptr;
+               next = head->next;
+               debug_rcu_head_unqueue((struct rcu_head *)ptr);
+               rcu_lock_acquire(&rcu_callback_map);
+               trace_rcu_invoke_kvfree_callback("slab", head, offset);
+               if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+                       kvfree(ptr);
+               rcu_lock_release(&rcu_callback_map);
+               cond_resched_tasks_rcu_qs();
+       }
+ }
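A hedged sketch (editorial, not part of the diff) of the head/ptr/offset relationship that kvfree_rcu_list() relies on for this fallback channel; struct foo, demo_rcu_head, and demo_offset() are illustrative names only:

        /* Stand-in for struct rcu_head (two pointers: next and func). */
        struct demo_rcu_head { void *next; void *func; };

        struct foo {
                long payload;
                struct demo_rcu_head rh;        /* embedded rcu_head */
        };

        /* The two-argument kvfree_rcu(p, rh) passes &p->rh as "head" and p as
         * "ptr"; the Channel-3 fallback in kvfree_call_rcu() stores ptr in
         * head->func.  The list walk above then recovers the base pointer from
         * head->func and recomputes the offset of the embedded head. */
        static unsigned long demo_offset(struct foo *p)
        {
                struct demo_rcu_head *head = &p->rh;

                head->func = p;
                return (unsigned long)((char *)head - (char *)head->func);
                /* == offsetof(struct foo, rh), accepted by __is_kvfree_rcu_offset() */
        }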
+ /*
+  * This function is invoked in workqueue context after a grace period.
+  * It frees all the objects queued on ->bulk_head_free or ->head_free.
+  */
+ static void kfree_rcu_work(struct work_struct *work)
+ {
+       unsigned long flags;
+       struct kvfree_rcu_bulk_data *bnode, *n;
+       struct list_head bulk_head[FREE_N_CHANNELS];
+       struct rcu_head *head;
+       struct kfree_rcu_cpu *krcp;
+       struct kfree_rcu_cpu_work *krwp;
+       struct rcu_gp_oldstate head_gp_snap;
+       int i;
+       krwp = container_of(to_rcu_work(work),
+               struct kfree_rcu_cpu_work, rcu_work);
+       krcp = krwp->krcp;
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       // Channels 1 and 2.
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+       // Channel 3.
+       head = krwp->head_free;
+       krwp->head_free = NULL;
+       head_gp_snap = krwp->head_free_gp_snap;
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       // Handle the first two channels.
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               // Start from the tail page, so a grace period is more likely to have elapsed for it.
+               list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+                       kvfree_rcu_bulk(krcp, bnode, i);
+       }
+       /*
+        * This is used when the "bulk" path cannot be used for the
+        * double-argument form of kvfree_rcu().  This happens when the
+        * page cache is empty, which means that objects are instead
+        * queued on a linked list through their rcu_head structures.
+        * This list is named "Channel 3".
+        */
+       if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+               kvfree_rcu_list(head);
+ }
+ static bool
+ need_offload_krc(struct kfree_rcu_cpu *krcp)
+ {
+       int i;
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (!list_empty(&krcp->bulk_head[i]))
+                       return true;
+       return !!READ_ONCE(krcp->head);
+ }
+ static bool
+ need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+ {
+       int i;
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (!list_empty(&krwp->bulk_head_free[i]))
+                       return true;
+       return !!krwp->head_free;
+ }
+ static int krc_count(struct kfree_rcu_cpu *krcp)
+ {
+       int sum = atomic_read(&krcp->head_count);
+       int i;
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               sum += atomic_read(&krcp->bulk_count[i]);
+       return sum;
+ }
+ static void
+ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+ {
+       long delay, delay_left;
+       delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+       if (delayed_work_pending(&krcp->monitor_work)) {
+               delay_left = krcp->monitor_work.timer.expires - jiffies;
+               if (delay < delay_left)
+                       mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+               return;
+       }
+       queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ }
+ static void
+ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+ {
+       unsigned long flags;
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       __schedule_delayed_monitor_work(krcp);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ }
+ static void
+ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+ {
+       struct list_head bulk_ready[FREE_N_CHANNELS];
+       struct kvfree_rcu_bulk_data *bnode, *n;
+       struct rcu_head *head_ready = NULL;
+       unsigned long flags;
+       int i;
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               INIT_LIST_HEAD(&bulk_ready[i]);
+               list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+                       if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+                               break;
+                       atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+                       list_move(&bnode->list, &bulk_ready[i]);
+               }
+       }
+       if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+               head_ready = krcp->head;
+               atomic_set(&krcp->head_count, 0);
+               WRITE_ONCE(krcp->head, NULL);
+       }
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+                       kvfree_rcu_bulk(krcp, bnode, i);
+       }
+       if (head_ready)
+               kvfree_rcu_list(head_ready);
+ }
+ /*
+  * Return: %true if a work is queued, %false otherwise.
+  */
+ static bool
+ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
+ {
+       unsigned long flags;
+       bool queued = false;
+       int i, j;
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       // Attempt to start a new batch.
+       for (i = 0; i < KFREE_N_BATCHES; i++) {
+               struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+               // Try to detach bulk_head or head and attach it to krwp, but
+               // only when all channels are free.  If any channel is not free,
+               // there is still ongoing RCU work handling krwp's previous batch.
+               if (need_wait_for_krwp_work(krwp))
+                       continue;
+               // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+               if (need_offload_krc(krcp)) {
+                       // Channel 1 corresponds to the SLAB-pointer bulk path.
+                       // Channel 2 corresponds to vmalloc-pointer bulk path.
+                       for (j = 0; j < FREE_N_CHANNELS; j++) {
+                               if (list_empty(&krwp->bulk_head_free[j])) {
+                                       atomic_set(&krcp->bulk_count[j], 0);
+                                       list_replace_init(&krcp->bulk_head[j],
+                                               &krwp->bulk_head_free[j]);
+                               }
+                       }
+                       // Channel 3 corresponds to both SLAB and vmalloc
+                       // objects queued on the linked list.
+                       if (!krwp->head_free) {
+                               krwp->head_free = krcp->head;
+                               get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+                               atomic_set(&krcp->head_count, 0);
+                               WRITE_ONCE(krcp->head, NULL);
+                       }
+                       // There is one work item per batch, and each batch
+                       // handles all three "free channels".  Break the loop,
+                       // since we are done with this CPU, and queuing the RCU
+                       // work here _always_ succeeds.
+                       queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
+                       WARN_ON_ONCE(!queued);
+                       break;
+               }
+       }
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       return queued;
+ }
+ /*
+  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+  */
+ static void kfree_rcu_monitor(struct work_struct *work)
+ {
+       struct kfree_rcu_cpu *krcp = container_of(work,
+               struct kfree_rcu_cpu, monitor_work.work);
+       // Drain ready for reclaim.
+       kvfree_rcu_drain_ready(krcp);
+       // Queue a batch for the rest.
+       kvfree_rcu_queue_batch(krcp);
+       // If there is nothing left to detach, our job is done. If at
+       // least one channel is still busy, rearm the work to repeat
+       // the attempt, because previous batches are still in progress.
+       if (need_offload_krc(krcp))
+               schedule_delayed_monitor_work(krcp);
+ }
+ static void fill_page_cache_func(struct work_struct *work)
+ {
+       struct kvfree_rcu_bulk_data *bnode;
+       struct kfree_rcu_cpu *krcp =
+               container_of(work, struct kfree_rcu_cpu,
+                       page_cache_work.work);
+       unsigned long flags;
+       int nr_pages;
+       bool pushed;
+       int i;
+       nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+               1 : rcu_min_cached_objs;
+       for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+               bnode = (struct kvfree_rcu_bulk_data *)
+                       __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+               if (!bnode)
+                       break;
+               raw_spin_lock_irqsave(&krcp->lock, flags);
+               pushed = put_cached_bnode(krcp, bnode);
+               raw_spin_unlock_irqrestore(&krcp->lock, flags);
+               if (!pushed) {
+                       free_page((unsigned long) bnode);
+                       break;
+               }
+       }
+       atomic_set(&krcp->work_in_progress, 0);
+       atomic_set(&krcp->backoff_page_cache_fill, 0);
+ }
+ // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+ // state specified by flags.  If can_alloc is true, the caller must
+ // be schedulable and not be holding any locks or mutexes that might be
+ // acquired by the memory allocator or anything that it might invoke.
+ // Returns true if ptr was successfully recorded, else the caller must
+ // use a fallback.
+ static inline bool
+ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+       unsigned long *flags, void *ptr, bool can_alloc)
+ {
+       struct kvfree_rcu_bulk_data *bnode;
+       int idx;
+       *krcp = krc_this_cpu_lock(flags);
+       if (unlikely(!(*krcp)->initialized))
+               return false;
+       idx = !!is_vmalloc_addr(ptr);
+       bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+               struct kvfree_rcu_bulk_data, list);
+       /* Check if a new block is required. */
+       if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+               bnode = get_cached_bnode(*krcp);
+               if (!bnode && can_alloc) {
+                       krc_this_cpu_unlock(*krcp, *flags);
+                       // __GFP_NORETRY - allows a lightweight direct reclaim,
+                       // which is acceptable because it minimizes how often the
+                       // fallback path is hit. It also forbids invoking the OOM
+                       // killer, which is beneficial since we are about to
+                       // release memory soon anyway.
+                       //
+                       // __GFP_NOMEMALLOC - prevents consuming all of the
+                       // memory reserves. Please note we have a fallback path.
+                       //
+                       // __GFP_NOWARN - the allocation is expected to fail
+                       // under low-memory or high-memory-pressure scenarios.
+                       bnode = (struct kvfree_rcu_bulk_data *)
+                               __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+                       raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+               }
+               if (!bnode)
+                       return false;
+               // Initialize the new block and attach it.
+               bnode->nr_records = 0;
+               list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+       }
+       // Finally insert and update the GP for this page.
+       bnode->nr_records++;
+       bnode->records[bnode->nr_records - 1] = ptr;
+       get_state_synchronize_rcu_full(&bnode->gp_snap);
+       atomic_inc(&(*krcp)->bulk_count[idx]);
+       return true;
+ }
+ #if !defined(CONFIG_TINY_RCU)
+ static enum hrtimer_restart
+ schedule_page_work_fn(struct hrtimer *t)
+ {
+       struct kfree_rcu_cpu *krcp =
+               container_of(t, struct kfree_rcu_cpu, hrtimer);
+       queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+       return HRTIMER_NORESTART;
+ }
+ static void
+ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+ {
+       // If cache disabled, bail out.
+       if (!rcu_min_cached_objs)
+               return;
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+                       !atomic_xchg(&krcp->work_in_progress, 1)) {
+               if (atomic_read(&krcp->backoff_page_cache_fill)) {
+                       queue_delayed_work(system_unbound_wq,
+                               &krcp->page_cache_work,
+                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+               } else {
+                       hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+                       krcp->hrtimer.function = schedule_page_work_fn;
+                       hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               }
+       }
+ }
+ void __init kfree_rcu_scheduler_running(void)
+ {
+       int cpu;
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+               if (need_offload_krc(krcp))
+                       schedule_delayed_monitor_work(krcp);
+       }
+ }
+ /*
+  * Queue a request for lazy invocation of the appropriate free routine
+  * after a grace period.  Please note that three paths are maintained,
+  * two for the common case using arrays of pointers and a third one that
+  * is used only when the main paths cannot be used, for example, due to
+  * memory pressure.
+  *
+  * Each kvfree_call_rcu() request is added to a batch. The batch is drained
+  * every KFREE_DRAIN_JIFFIES jiffies, and all the objects in it are freed in
+  * workqueue context. Batching requests together reduces the number of grace
+  * periods needed during a heavy kfree_rcu()/kvfree_rcu() load.
+  */
+ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+ {
+       unsigned long flags;
+       struct kfree_rcu_cpu *krcp;
+       bool success;
+       /*
+        * Please note that the head-less variant has a limitation,
+        * hence the clear rule for such objects: it may be used only
+        * from a context where sleeping is allowed. For other places,
+        * please embed an rcu_head in your data.
+        */
+       if (!head)
+               might_sleep();
+       // Queue the object but don't yet schedule the batch.
+       if (debug_rcu_head_queue(ptr)) {
+               // Probable double kfree_rcu(), just leak.
+               WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+                         __func__, head);
+               // Mark as success and leave.
+               return;
+       }
 -      kasan_record_aux_stack_noalloc(ptr);
++      kasan_record_aux_stack(ptr);
+       success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+       if (!success) {
+               run_page_cache_worker(krcp);
+               if (head == NULL)
+                       // Inline if kvfree_rcu(one_arg) call.
+                       goto unlock_return;
+               head->func = ptr;
+               head->next = krcp->head;
+               WRITE_ONCE(krcp->head, head);
+               atomic_inc(&krcp->head_count);
+               // Take a snapshot for this krcp.
+               krcp->head_gp_snap = get_state_synchronize_rcu();
+               success = true;
+       }
+       /*
+        * The kvfree_rcu() caller considers the pointer freed at this point
+        * and likely removes any references to it. Since the actual slab
+        * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+        * this object (no scanning or false positives reporting).
+        */
+       kmemleak_ignore(ptr);
+       // Set timer to drain after KFREE_DRAIN_JIFFIES.
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+               __schedule_delayed_monitor_work(krcp);
+ unlock_return:
+       krc_this_cpu_unlock(krcp, flags);
+       /*
+        * Inline kvfree() after synchronize_rcu(). We can do
+        * it from might_sleep() context only, so the current
+        * CPU can pass the QS state.
+        */
+       if (!success) {
+               debug_rcu_head_unqueue((struct rcu_head *) ptr);
+               synchronize_rcu();
+               kvfree(ptr);
+       }
+ }
+ EXPORT_SYMBOL_GPL(kvfree_call_rcu);
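A hedged usage sketch (editorial, not part of the diff): callers normally reach kvfree_call_rcu() through the two-argument kvfree_rcu()/kfree_rcu() macros or the single-argument kvfree_rcu_mightsleep() from <linux/rcupdate.h>; struct blob and release_blob() are illustrative names:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct blob {
                int data;
                struct rcu_head rh;     /* needed for the two-argument form */
        };

        static void release_blob(struct blob *b, char *buf)
        {
                /* Two-argument form: callable from any context; rides the
                 * batched bulk/Channel-3 paths implemented above. */
                kvfree_rcu(b, rh);

                /* Single-argument (head-less) form: may sleep, because it can
                 * fall back to synchronize_rcu() followed by kvfree(). */
                kvfree_rcu_mightsleep(buf);
        }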
+ /**
+  * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+  *
+  * Note that the single-argument form of kvfree_rcu() has a slow path that
+  * calls synchronize_rcu() followed by freeing the pointer, all before the
+  * function returns. Therefore, for any single-argument call that results in
+  * a kfree() to a cache that is to be destroyed during module exit, it is
+  * the developer's responsibility to ensure that all such calls have
+  * returned before the call to kmem_cache_destroy().
+  */
+ void kvfree_rcu_barrier(void)
+ {
+       struct kfree_rcu_cpu_work *krwp;
+       struct kfree_rcu_cpu *krcp;
+       bool queued;
+       int i, cpu;
+       /*
+        * First we detach objects and queue them in an RCU batch for
+        * each CPU. Then the queued works are flushed for each CPU.
+        *
+        * Please note that if there are outstanding batches for a
+        * particular CPU, those have to finish first, followed by
+        * queuing a new one.
+        */
+       for_each_possible_cpu(cpu) {
+               krcp = per_cpu_ptr(&krc, cpu);
+               /*
+                * Check if this CPU has any objects which have been queued
+                * for a new GP completion. If not (nothing to detach), we
+                * are done with it. If any batch is pending/running for this
+                * "krcp", the per-CPU flush_rcu_work() below waits for its
+                * completion (see the last step).
+                */
+               if (!need_offload_krc(krcp))
+                       continue;
+               while (1) {
+                       /*
+                        * If we are not able to queue a new RCU work item, it
+                        * means either:
+                        * - batches for this CPU are still in flight and should
+                        *   be flushed first, after which we repeat; or
+                        * - there are no objects to detach, because of concurrency.
+                        */
+                       queued = kvfree_rcu_queue_batch(krcp);
+                       /*
+                        * Bail out if there is no need to offload this "krcp"
+                        * anymore. As noted earlier, it can run concurrently.
+                        */
+                       if (queued || !need_offload_krc(krcp))
+                               break;
+                       /* There are ongoing batches. */
+                       for (i = 0; i < KFREE_N_BATCHES; i++) {
+                               krwp = &(krcp->krw_arr[i]);
+                               flush_rcu_work(&krwp->rcu_work);
+                       }
+               }
+       }
+       /*
+        * Now we guarantee that all objects are flushed.
+        */
+       for_each_possible_cpu(cpu) {
+               krcp = per_cpu_ptr(&krc, cpu);
+               /*
+                * A monitor work item can drain ready-to-reclaim objects
+                * directly. Wait for its completion if it is running or pending.
+                */
+               cancel_delayed_work_sync(&krcp->monitor_work);
+               for (i = 0; i < KFREE_N_BATCHES; i++) {
+                       krwp = &(krcp->krw_arr[i]);
+                       flush_rcu_work(&krwp->rcu_work);
+               }
+       }
+ }
+ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
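A hedged sketch (editorial, not part of the diff) of the intended use of kvfree_rcu_barrier() described above: flush in-flight kvfree_rcu() requests before the backing cache goes away; demo_cache and demo_exit() are illustrative names:

        #include <linux/module.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        static struct kmem_cache *demo_cache;

        static void __exit demo_exit(void)
        {
                /* Wait until every object already handed to kvfree_rcu()/
                 * kfree_rcu() has actually been freed ... */
                kvfree_rcu_barrier();

                /* ... only then is it safe to destroy the backing cache. */
                kmem_cache_destroy(demo_cache);
        }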
+ #endif /* #if !defined(CONFIG_TINY_RCU) */
+ static unsigned long
+ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+ {
+       int cpu;
+       unsigned long count = 0;
+       /* Snapshot count of all CPUs */
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+               count += krc_count(krcp);
+               count += READ_ONCE(krcp->nr_bkv_objs);
+               atomic_set(&krcp->backoff_page_cache_fill, 1);
+       }
+       return count == 0 ? SHRINK_EMPTY : count;
+ }
+ static unsigned long
+ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+ {
+       int cpu, freed = 0;
+       for_each_possible_cpu(cpu) {
+               int count;
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+               count = krc_count(krcp);
+               count += drain_page_cache(krcp);
+               kfree_rcu_monitor(&krcp->monitor_work.work);
+               sc->nr_to_scan -= count;
+               freed += count;
+               if (sc->nr_to_scan <= 0)
+                       break;
+       }
+       return freed == 0 ? SHRINK_STOP : freed;
+ }
+ void __init kvfree_rcu_init(void)
+ {
+       int cpu;
+       int i, j;
+       struct shrinker *kfree_rcu_shrinker;
+       /* Clamp it to [0:100] seconds interval. */
+       if (rcu_delay_page_cache_fill_msec < 0 ||
+               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+               rcu_delay_page_cache_fill_msec =
+                       clamp(rcu_delay_page_cache_fill_msec, 0,
+                               (int) (100 * MSEC_PER_SEC));
+               pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+                       rcu_delay_page_cache_fill_msec);
+       }
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+               for (i = 0; i < KFREE_N_BATCHES; i++) {
+                       INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+                       krcp->krw_arr[i].krcp = krcp;
+                       for (j = 0; j < FREE_N_CHANNELS; j++)
+                               INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
+               }
+               for (i = 0; i < FREE_N_CHANNELS; i++)
+                       INIT_LIST_HEAD(&krcp->bulk_head[i]);
+               INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+               INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
+               krcp->initialized = true;
+       }
+       kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
+       if (!kfree_rcu_shrinker) {
+               pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+               return;
+       }
+       kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+       kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+       shrinker_register(kfree_rcu_shrinker);
+ }
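A hedged worked example (editorial, not part of the diff) of the [0:100] second clamp performed in kvfree_rcu_init(); demo_clamp_fill_msec() is an illustrative stand-alone restatement of that logic:

        /* Values outside [0, 100 * MSEC_PER_SEC] are pulled back into range,
         * matching the clamp() call above. */
        static int demo_clamp_fill_msec(int msec)
        {
                if (msec < 0)
                        return 0;
                if (msec > 100 * 1000)
                        return 100 * 1000;
                return msec;
        }

        /* demo_clamp_fill_msec(200000) -> 100000 (100 seconds)
         * demo_clamp_fill_msec(-5)     -> 0
         * demo_clamp_fill_msec(5000)   -> 5000 (the default) */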