about | summary | refs | log | tree | commit | diff | stats
path: root/mm
diff options
author: Shakeel Butt <shakeel.butt@linux.dev> 2026-05-25 20:39:28 -0700
committer: Andrew Morton <akpm@linux-foundation.org> 2026-05-28 21:32:02 -0700
commit: a48a944f102d99e934839a78b815fd8792a7d346 (patch)
tree: b70f3ccb779a63e0a4347274e3c02fa395c98813 /mm
parent: 818054d4c6972dc4f20f1c5b5081505202a123da (diff)
download: linux-next-history-a48a944f102d99e934839a78b815fd8792a7d346.tar.gz
memcg: store node_id instead of pglist_data pointer
Patch series "memcg: shrink obj_stock_pcp and cache multiple objcgs", v3.

Commit 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") split a memcg's single obj_cgroup into one per NUMA node so that reparenting LRU folios can take per-node lru locks. As a side effect, the per-CPU obj_stock_pcp -- which caches a single cached_objcg pointer -- thrashes on workloads where threads of the same memcg run on different NUMA nodes. The kernel test robot reported a 67.7% regression on stress-ng.switch.ops_per_sec from this pattern.

Commit d0211878ce06 ("memcg: cache obj_stock by memcg, not by objcg pointer") landed as a temporary fix by treating sibling per-node objcgs as equivalent for the cache lookup, intended to be reverted once per-node kmem accounting is introduced.

This series takes a more general approach: cache multiple objcgs per CPU using the multi-slot pattern memcg_stock_pcp already uses, so the per-node objcg variants of one memcg can all coexist in the stock without ever forcing a drain. The temporary fix can then be reverted.

To avoid increasing the per-CPU cache footprint, the first three patches shrink the existing single-slot obj_stock_pcp fields. The final patch converts cached_objcg and nr_bytes into NR_OBJ_STOCK=5 slot arrays and reorders the struct so the entire consume/refill/account hot path fits within a single 64-byte cache line on non-debug 64-bit builds (verified with pahole).

This patch (of 4):

The struct obj_stock_pcp stores a pointer to pglist_data for the slab stats cached on the cpu. On 64-bit machines, this costs 8 bytes. The pointer is not strictly required: NODE_DATA() can recover it from the node id. Replace cached_pgdat with int16_t node_id and use NUMA_NO_NODE as the "no stats cached" sentinel.

At the moment all the archs limit MAX_NUMNODES to 1024 so int16_t is plenty; a BUILD_BUG_ON() makes sure we notice if that ever changes.
Link: https://lore.kernel.org/20260526033931.1760588-1-shakeel.butt@linux.dev
Link: https://lore.kernel.org/20260526033931.1760588-2-shakeel.butt@linux.dev
Fixes: 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type")
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Tested-by: kernel test robot <oliver.sang@intel.com>
Acked-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Harry Yoo (Oracle) <harry@kernel.org>
Acked-by: Qi Zheng <qi.zheng@linux.dev>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/memcontrol.c 26
1 files changed, 19 insertions, 7 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 92269740eef13..e983fa590af8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2022,7 +2022,7 @@ struct obj_stock_pcp {
local_trylock_t lock;
unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
- struct pglist_data *cached_pgdat;
+ int16_t node_id;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
@@ -2032,6 +2032,7 @@ struct obj_stock_pcp {
static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
.lock = INIT_LOCAL_TRYLOCK(lock),
+ .node_id = NUMA_NO_NODE,
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -3162,6 +3163,13 @@ static void __account_obj_stock(struct obj_cgroup *objcg,
{
int *bytes;
+ /*
+ * MAX_NUMNODES is currently <= 1024 on all archs, but make sure it
+ * stays below S16_MAX; otherwise the node_id type in struct
+ * obj_stock_pcp needs to be widened.
+ */
+ BUILD_BUG_ON(MAX_NUMNODES >= S16_MAX);
+
if (!stock || READ_ONCE(stock->cached_objcg) != objcg)
goto direct;
@@ -3169,9 +3177,11 @@ static void __account_obj_stock(struct obj_cgroup *objcg,
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat changes.
*/
- if (stock->cached_pgdat != pgdat) {
+ if (stock->node_id == NUMA_NO_NODE) {
+ stock->node_id = pgdat->node_id;
+ } else if (stock->node_id != pgdat->node_id) {
/* Flush the existing cached vmstat data */
- struct pglist_data *oldpg = stock->cached_pgdat;
+ struct pglist_data *oldpg = NODE_DATA(stock->node_id);
if (stock->nr_slab_reclaimable_b) {
mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
@@ -3183,7 +3193,7 @@ static void __account_obj_stock(struct obj_cgroup *objcg,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
- stock->cached_pgdat = pgdat;
+ stock->node_id = pgdat->node_id;
}
bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
@@ -3279,19 +3289,21 @@ static void drain_obj_stock(struct obj_stock_pcp *stock)
* Flush the vmstat data in current stock
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ struct pglist_data *oldpg = NODE_DATA(stock->node_id);
+
if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, oldpg,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, oldpg,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
- stock->cached_pgdat = NULL;
+ stock->node_id = NUMA_NO_NODE;
}
WRITE_ONCE(stock->cached_objcg, NULL);