diff options
| author | Suren Baghdasaryan <surenb@google.com> | 2026-04-25 23:27:16 -0700 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2026-05-28 21:31:32 -0700 |
| commit | 66003bc8b3844b9c76f7d705dee5821598b90f21 (patch) | |
| tree | 63c8ee700be18c84028a6c712e5ec78b5a5661c8 /fs | |
| parent | 8730860a82eaf8e6ff8ebc55e07c5e428ccbce0d (diff) | |
| download | linux-next-history-66003bc8b3844b9c76f7d705dee5821598b90f21.tar.gz | |
fs/proc/task_mmu: read proc/pid/{smaps|numa_maps} under per-vma lock
Patch series "use vma locks for proc/pid/{smaps|numa_maps} reads", v2.
Use per-vma locks when reading /proc/pid/smaps and /proc/pid/numa_maps
similar to /proc/pid/maps to reduce contention on central mmap_lock. One
major difference between maps and smaps/numa_maps reading is that the
latter executes page table walk which can't be done under RCU due to a
possibility of sleeping. Therefore we drop RCU read lock before this walk
while keeping the VMA locked. After the walk we retake RCU read lock,
reset VMA iterator and proceed with the next VMA.
The last two patches extend /proc/pid/maps test to cover /proc/pid/smaps
reading during concurrent address space modification.
This patch (of 3):
proc/pid/{smaps|numa_maps} can be read using the combination of RCU and
VMA read locks, similar to proc/pid/maps. RCU is required to safely
traverse the VMA tree and VMA lock stabilizes the VMA being processed and
the pagetable walk.
Link: https://lore.kernel.org/20260426062718.1238437-1-surenb@google.com
Link: https://lore.kernel.org/20260426062718.1238437-2-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <liam@infradead.org>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/proc/task_mmu.c | 195 |
1 files changed, 156 insertions, 39 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 751b9ba160fbf..1e3a15bf46f4e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -132,6 +132,22 @@ static void release_task_mempolicy(struct proc_maps_private *priv) #ifdef CONFIG_PER_VMA_LOCK +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + int ret = mmap_read_lock_killable(lock_ctx->mm); + + if (!ret) + lock_ctx->mmap_locked = true; + + return ret; +} + +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); + lock_ctx->mmap_locked = false; +} + static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) { lock_ctx->locked_vma = NULL; @@ -146,25 +162,11 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) } } -static const struct seq_operations proc_pid_maps_op; - static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { - /* - * smaps and numa_maps perform page table walk, therefore require - * mmap_lock but maps can be read with locking just the vma and - * walking the vma tree under rcu read protection. - */ - if (m->op != &proc_pid_maps_op) { - if (mmap_read_lock_killable(lock_ctx->mm)) - return false; - - lock_ctx->mmap_locked = true; - } else { - rcu_read_lock(); - reset_lock_ctx(lock_ctx); - } + rcu_read_lock(); + reset_lock_ctx(lock_ctx); return true; } @@ -172,7 +174,7 @@ static inline bool lock_vma_range(struct seq_file *m, static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->mmap_locked) { - mmap_read_unlock(lock_ctx->mm); + unlock_ctx_mm(lock_ctx); } else { unlock_ctx_vma(lock_ctx); rcu_read_unlock(); @@ -213,17 +215,45 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, return true; } +static inline void drop_rcu(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return; + + rcu_read_unlock(); +} + +static inline void reacquire_rcu(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return; + + rcu_read_lock(); + /* Reinitialize the iterator. */ + vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end); +} + #else /* CONFIG_PER_VMA_LOCK */ +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm); +} + +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { - return mmap_read_lock_killable(lock_ctx->mm) == 0; + return lock_ctx_mm(lock_ctx) == 0; } static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { - mmap_read_unlock(lock_ctx->mm); + unlock_ctx_mm(lock_ctx); } static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, @@ -238,6 +268,9 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, return false; } +static inline void drop_rcu(struct proc_maps_private *priv) {} +static inline void reacquire_rcu(struct proc_maps_private *priv) {} + #endif /* CONFIG_PER_VMA_LOCK */ static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) @@ -538,12 +571,10 @@ static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { - if (lock_ctx->mmap_locked) { - mmap_read_unlock(lock_ctx->mm); - lock_ctx->mmap_locked = false; - } else { + if (lock_ctx->mmap_locked) + unlock_ctx_mm(lock_ctx); + else unlock_ctx_vma(lock_ctx); - } } static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, @@ -1280,21 +1311,75 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; +#ifdef CONFIG_PER_VMA_LOCK + +static const struct mm_walk_ops smaps_walk_vma_lock_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static const struct mm_walk_ops smaps_shmem_walk_vma_lock_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static inline const struct mm_walk_ops * +get_smaps_walk_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &smaps_walk_ops; + return &smaps_walk_vma_lock_ops; +} + +static inline const struct mm_walk_ops * +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &smaps_shmem_walk_ops; + return &smaps_shmem_walk_vma_lock_ops; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline const struct mm_walk_ops * +get_smaps_walk_ops(struct proc_maps_private *priv) +{ + return &smaps_walk_ops; +} + +static inline const struct mm_walk_ops * +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) +{ + return &smaps_shmem_walk_ops; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0. */ -static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss, unsigned long start) +static void smap_gather_stats(struct proc_maps_private *priv, + struct vm_area_struct *vma, + struct mem_size_stats *mss, unsigned long start) { - const struct mm_walk_ops *ops = &smaps_walk_ops; + const struct mm_walk_ops *ops = get_smaps_walk_ops(priv); /* Invalid start */ if (start >= vma->vm_end) return; + if (vma == get_gate_vma(priv->lock_ctx.mm)) + return; + + /* Might sleep. Drop RCU read lock but keep the VMA locked. */ + drop_rcu(priv); + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -1312,15 +1397,16 @@ static void smap_gather_stats(struct vm_area_struct *vma, !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { - ops = &smaps_shmem_walk_ops; + ops = get_smaps_shmem_walk_ops(priv); } } - /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); + + reacquire_rcu(priv); } #define SEQ_PUT_DEC(str, val) \ @@ -1369,10 +1455,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, static int show_smap(struct seq_file *m, void *v) { + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct mem_size_stats mss = {}; - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); show_map_vma(m, vma); @@ -1413,7 +1500,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_task; } - ret = mmap_read_lock_killable(mm); + ret = lock_ctx_mm(&priv->lock_ctx); if (ret) goto out_put_mm; @@ -1425,7 +1512,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) vma_start = vma->vm_start; do { - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); last_vma_end = vma->vm_end; /* @@ -1434,8 +1521,8 @@ static int show_smaps_rollup(struct seq_file *m, void *v) */ if (mmap_lock_is_contended(mm)) { vma_iter_invalidate(&vmi); - mmap_read_unlock(mm); - ret = mmap_read_lock_killable(mm); + unlock_ctx_mm(&priv->lock_ctx); + ret = lock_ctx_mm(&priv->lock_ctx); if (ret) { release_task_mempolicy(priv); goto out_put_mm; @@ -1484,14 +1571,14 @@ static int show_smaps_rollup(struct seq_file *m, void *v) /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) { - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); last_vma_end = vma->vm_end; continue; } /* Case 4 above */ if (vma->vm_end > last_vma_end) { - smap_gather_stats(vma, &mss, last_vma_end); + smap_gather_stats(priv, vma, &mss, last_vma_end); last_vma_end = vma->vm_end; } } @@ -1505,7 +1592,7 @@ empty_set: __show_smap(m, &mss, true); release_task_mempolicy(priv); - mmap_read_unlock(mm); + unlock_ctx_mm(&priv->lock_ctx); out_put_mm: mmput(mm); @@ -3291,6 +3378,31 @@ static const struct mm_walk_ops show_numa_ops = { .walk_lock = PGWALK_RDLOCK, }; +#ifdef CONFIG_PER_VMA_LOCK +static const struct mm_walk_ops show_numa_vma_lock_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static inline const struct mm_walk_ops * +get_show_numa_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &show_numa_ops; + return &show_numa_vma_lock_ops; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline const struct mm_walk_ops * +get_show_numa_ops(struct proc_maps_private *priv) +{ + return &show_numa_ops; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + /* * Display pages allocated per node and memory policy via /proc. */ @@ -3335,8 +3447,13 @@ static int show_numa_map(struct seq_file *m, void *v) if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - /* mmap_lock is held by m_start */ - walk_page_vma(vma, &show_numa_ops, md); + /* Skip walking pages if gate VMA */ + if (vma != get_gate_vma(proc_priv->lock_ctx.mm)) { + /* Might sleep. Drop RCU read lock but keep the VMA locked. */ + drop_rcu(proc_priv); + walk_page_vma(vma, get_show_numa_ops(proc_priv), md); + reacquire_rcu(proc_priv); + } if (!md->pages) goto out; |
