aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
authorKairui Song <kasong@tencent.com>2026-05-13 17:21:11 +0800
committerAndrew Morton <akpm@linux-foundation.org>2026-05-28 21:31:02 -0700
commit6fac788d7b88e8dd8a4668b89acea4403c470c4c (patch)
treea147674d4f33bcc9b92330f0e4ec7df14771d33b /mm
parent2625d0dd646af7b18467cd0ff567b16987bfcd5e (diff)
downloadlinux-next-history-6fac788d7b88e8dd8a4668b89acea4403c470c4c.tar.gz
mm, swap: avoid leaving unused extend table after alloc race
Allocating an extend table requires dropping the ci lock first. While the lock is dropped, a concurrent put can decrease the slot's swap count to a value that is no longer maxed out, so the extend table is no longer required. The current allocation path still attaches the new extend table to the cluster anyway, leaving it unused. The next maxed out count on the same cluster may still reuse the table and free it properly. But swapoff could indeed leak it. To eliminate the waste, re-check under the ci lock that the extend table is still needed before publishing it, and free the local allocation otherwise. Also close the check window by ensuring every count decrement that brings a slot below SWP_TB_COUNT_MAX - 1 runs swap_extend_table_try_free(), not just the MAX to MAX - 1 transition. With this, a freshly published extend table that becomes redundant due to a racing put is freed on the very next decrement, restoring the invariant that an empty cluster never has a non-NULL ci->extend_table. The added overhead is negligible. [kasong@tencent.com: v2] Link: https://lore.kernel.org/20260515-swap-extend-table-fix-v2-1-833d72ad53e5@tencent.com Link: https://lore.kernel.org/20260513-swap-extend-table-fix-v1-1-a71dea851fb3@tencent.com Fixes: 0d6af9bcf383 ("mm, swap: use the swap table to track the swap count") Signed-off-by: Kairui Song <kasong@tencent.com> Reported-by: Breno Leitao <leitao@debian.org> Closes: https://lore.kernel.org/linux-mm/agG6Dp0umhs6O1SY@gmail.com/ Tested-by: Breno Leitao <leitao@debian.org> Cc: Baoquan He <bhe@redhat.com> Cc: Barry Song <baohua@kernel.org> Cc: Chris Li <chrisl@kernel.org> Cc: Kemeng Shi <shikemeng@huaweicloud.com> Cc: Nhat Pham <nphamcs@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/swapfile.c42
1 files changed, 34 insertions, 8 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74a1e324449dc..ee515a6fbccd4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,8 +1443,10 @@ start_over:
}
static int swap_extend_table_alloc(struct swap_info_struct *si,
- struct swap_cluster_info *ci, gfp_t gfp)
+ struct swap_cluster_info *ci,
+ unsigned int ci_off, gfp_t gfp)
{
+ int count;
void *table;
table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp);
@@ -1452,12 +1454,28 @@ static int swap_extend_table_alloc(struct swap_info_struct *si,
return -ENOMEM;
spin_lock(&ci->lock);
- if (!ci->extend_table)
- ci->extend_table = table;
- else
- kfree(table);
+ /*
+ * Extend table allocation requires releasing ci lock first so it's
+ * possible that the slot has been freed, no longer overflowed, or
+ * a concurrent extend table allocation has already succeeded, so
+ * the allocation is no longer needed.
+ */
+ if (!cluster_table_is_alloced(ci))
+ goto out_free;
+ count = swp_tb_get_count(__swap_table_get(ci, ci_off));
+ if (count < (SWP_TB_COUNT_MAX - 1))
+ goto out_free;
+ if (ci->extend_table)
+ goto out_free;
+
+ ci->extend_table = table;
spin_unlock(&ci->lock);
return 0;
+
+out_free:
+ spin_unlock(&ci->lock);
+ kfree(table);
+ return 0;
}
int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
@@ -1472,7 +1490,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
return 0;
ci = __swap_offset_to_cluster(si, offset);
- ret = swap_extend_table_alloc(si, ci, gfp);
+ ret = swap_extend_table_alloc(si, ci, swp_cluster_offset(entry), gfp);
put_swap_device(si);
return ret;
@@ -1519,13 +1537,21 @@ static void __swap_cluster_put_entry(struct swap_cluster_info *ci,
if (count == (SWP_TB_COUNT_MAX - 1)) {
ci->extend_table[ci_off] = 0;
__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count));
- swap_extend_table_try_free(ci);
} else {
ci->extend_table[ci_off] = count;
}
} else {
__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count));
}
+
+ /*
+ * `SWP_TB_COUNT_MAX - 1` triggers extend table allocation. If the
+ * count was above that, then the extend table is no longer needed,
+ * so free it. And if we just put the count value from MAX - 1, it's
+ * also possible that a pending dup just attached an extend table.
+ */
+ if (unlikely(count == SWP_TB_COUNT_MAX - 2 || count == SWP_TB_COUNT_MAX - 1))
+ swap_extend_table_try_free(ci);
}
/**
@@ -1665,7 +1691,7 @@ restart:
if (unlikely(err)) {
if (err == -ENOMEM) {
spin_unlock(&ci->lock);
- err = swap_extend_table_alloc(si, ci, GFP_ATOMIC);
+ err = swap_extend_table_alloc(si, ci, ci_off, GFP_ATOMIC);
spin_lock(&ci->lock);
if (!err)
goto restart;