aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
authorAndrew Morton <akpm@linux-foundation.org>2026-05-28 21:30:18 -0700
committerAndrew Morton <akpm@linux-foundation.org>2026-05-28 21:30:18 -0700
commit82432d0b9ab22b91c2baf0458f3a0669af7a8156 (patch)
treee377f474bae4c2166caeb560b2853539f78829c2 /mm
parentd90fdc074685684dfc210e86688cb009c1a327a7 (diff)
parentcfaef29c20e86738aec28641b6de1e078298999e (diff)
downloadlinux-next-history-82432d0b9ab22b91c2baf0458f3a0669af7a8156.tar.gz
foo
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/damon/core.c335
-rw-r--r--mm/damon/lru_sort.c54
-rw-r--r--mm/damon/ops-common.c9
-rw-r--r--mm/damon/reclaim.c93
-rw-r--r--mm/damon/stat.c92
-rw-r--r--mm/damon/sysfs-schemes.c65
-rw-r--r--mm/damon/sysfs.c31
-rw-r--r--mm/damon/tests/core-kunit.h10
-rw-r--r--mm/damon/vaddr.c3
-rw-r--r--mm/filemap.c67
-rw-r--r--mm/gup.c8
-rw-r--r--mm/huge_memory.c107
-rw-r--r--mm/hugetlb.c7
-rw-r--r--mm/kfence/kfence_test.c2
-rw-r--r--mm/khugepaged.c9
-rw-r--r--mm/kmemleak.c148
-rw-r--r--mm/madvise.c60
-rw-r--r--mm/memcontrol.c5
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c57
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/memremap.c4
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/migrate_device.c9
-rw-r--r--mm/mm_init.c47
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/page_alloc.c265
-rw-r--r--mm/page_io.c42
-rw-r--r--mm/page_isolation.c65
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/sparse-vmemmap.c75
-rw-r--r--mm/sparse.c68
-rw-r--r--mm/swap.c41
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/vmalloc.c29
-rw-r--r--mm/vmpressure.c15
-rw-r--r--mm/vmscan.c19
41 files changed, 1342 insertions, 597 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad90..e221fa1dc54d0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -810,7 +810,6 @@ if TRANSPARENT_HUGEPAGE
choice
prompt "Transparent Hugepage Support sysfs defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_ALWAYS
help
Selects the sysfs defaults for Transparent Hugepage Support.
@@ -840,7 +839,6 @@ endchoice
choice
prompt "Shmem hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -886,7 +884,6 @@ endchoice
choice
prompt "Tmpfs hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -931,7 +928,7 @@ endchoice
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
+ depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3dbbbfdeff719..9f38deddcb30e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -13,10 +13,14 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/psi.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_choices.h>
+/* for damon_get_folio() used by node eligible memory metrics */
+#include "ops-common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -918,6 +922,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src)
if (err)
return err;
dst->goal_tuner = src->goal_tuner;
+ dst->fail_charge_num = src->fail_charge_num;
+ dst->fail_charge_denom = src->fail_charge_denom;
dst->weight_sz = src->weight_sz;
dst->weight_nr_accesses = src->weight_nr_accesses;
dst->weight_age = src->weight_age;
@@ -1326,11 +1332,26 @@ static int damon_commit_targets(
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
int err;
+ struct damos *scheme;
+ struct damos_quota_goal *goal;
dst->maybe_corrupted = true;
if (!is_power_of_2(src->min_region_sz))
return -EINVAL;
+ /* node_eligible_mem_bp metric requires PADDR ops */
+ if (src->ops.id != DAMON_OPS_PADDR) {
+ damon_for_each_scheme(scheme, src) {
+ struct damos_quota *quota = &scheme->quota;
+
+ damos_for_each_quota_goal(goal, quota) {
+ if (goal->metric ==
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP)
+ return -EINVAL;
+ }
+ }
+ }
+
err = damon_commit_schemes(dst, src);
if (err)
return err;
@@ -1349,6 +1370,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
if (err)
return err;
}
+ dst->pause = src->pause;
dst->ops = src->ops;
dst->addr_unit = src->addr_unit;
dst->min_region_sz = src->min_region_sz;
@@ -2046,6 +2068,37 @@ static void damos_walk_cancel(struct damon_ctx *ctx)
mutex_unlock(&ctx->walk_control_lock);
}
+static void damos_charge_quota(struct damos_quota *quota,
+ unsigned long sz_region, unsigned long sz_applied)
+{
+ /*
+ * sz_applied could be bigger than sz_region, depending on ops
+ * implementation of the action, e.g., damos_pa_pageout(). Charge only
+ * the region size in the case.
+ */
+ if (!quota->fail_charge_denom || sz_applied > sz_region)
+ quota->charged_sz += sz_region;
+ else
+ quota->charged_sz += sz_applied + mult_frac(
+ (sz_region - sz_applied),
+ quota->fail_charge_num,
+ quota->fail_charge_denom);
+}
+
+static bool damos_quota_is_full(struct damos_quota *quota,
+ unsigned long min_region_sz)
+{
+ if (!damos_quota_is_set(quota))
+ return false;
+ if (quota->charged_sz >= quota->esz)
+ return true;
+ /*
+ * DAMOS action is applied per region, so <min_region_sz remaining
+ * quota means the quota is effectively full.
+ */
+ return quota->esz - quota->charged_sz < min_region_sz;
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -2102,11 +2155,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
- quota->charged_sz += sz;
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz) {
+ damos_charge_quota(quota, sz, sz_applied);
+ if (damos_quota_is_full(quota, c->min_region_sz)) {
quota->charge_target_from = t;
- quota->charge_addr_from = r->ar.end + 1;
+ quota->charge_addr_from = r->ar.end;
}
}
if (s->action != DAMOS_STAT)
@@ -2132,8 +2184,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
continue;
/* Check the quota */
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
continue;
if (damos_skip_charged_region(t, r, s, c->min_region_sz))
@@ -2152,6 +2203,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
}
/*
+ * damos_apply_target() - Apply DAMOS schemes to a given target.
+ * @c: monitoring context to apply its DAMOS schemes to..
+ * @t: monitoring target to apply the schemes to.
+ * @max_region_sz: maximum region size for @c.
+ *
+ * This function could split regions for keeping the quota. To minimize
+ * overhead from the split operations increased number of regions, this
+ * function will also merge regions after the schemes applying attempt is done,
+ * for each region. The merge operation is made only when it doesn't lose the
+ * monitoring information and not violating @max_region_sz.
+ *
+ * Hence, after this function is called, the total number of regions could
+ * be increased or reduced. The increase could make max_nr_regions temporarily
+ * be violated, until the next per-aggregation interval regions merge operation
+ * is executed. The decrease will not violate min_nr_regions though, since it
+ * keeps @max_region_sz.
+ */
+static void damos_apply_target(struct damon_ctx *c, struct damon_target *t,
+ unsigned long max_region_sz)
+{
+ struct damon_region *r;
+
+ damon_for_each_region(r, t) {
+ struct damon_region *prev_r;
+
+ damon_do_apply_schemes(c, t, r);
+ /*
+ * damon_do_apply_scheems() could split the region for the
+ * quota. Keeping the new slices is an overhead. Merge back
+ * the slices into the previous region if it doesn't lose any
+ * information and not violating the max_region_sz.
+ */
+ if (damon_first_region(t) == r)
+ continue;
+ prev_r = damon_prev_region(r);
+ if (prev_r->ar.end != r->ar.start)
+ continue;
+ if (prev_r->age != r->age)
+ continue;
+ if (prev_r->last_nr_accesses != r->last_nr_accesses)
+ continue;
+ if (prev_r->nr_accesses != r->nr_accesses)
+ continue;
+ if (r->ar.end - prev_r->ar.start > max_region_sz)
+ continue;
+ prev_r->ar.end = r->ar.end;
+ damon_destroy_region(r, t);
+ r = prev_r;
+ }
+}
+
+/*
* damon_feed_loop_next_input() - get next input to achieve a target score.
* @last_input The last input.
* @score Current score that made with @last_input.
@@ -2287,7 +2390,115 @@ static unsigned long damos_get_node_memcg_used_bp(
numerator = i.totalram - used_pages;
return mult_frac(numerator, 10000, i.totalram);
}
-#else
+
+#ifdef CONFIG_DAMON_PADDR
+/*
+ * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node.
+ * @c: The DAMON context.
+ * @s: The scheme.
+ * @nid: The target NUMA node id.
+ * @total: Output for total eligible bytes across all nodes.
+ *
+ * Iterates through each folio in eligible regions to accurately determine
+ * which node the memory resides on. Returns eligible bytes on the specified
+ * node and sets *total to the sum across all nodes.
+ *
+ * Note: This function requires damon_get_folio() from ops-common.c, which is
+ * only available when CONFIG_DAMON_PADDR is enabled. It also requires the
+ * context to be using PADDR operations for meaningful results.
+ */
+static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c,
+ struct damos *s, int nid, phys_addr_t *total)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ phys_addr_t total_eligible = 0;
+ phys_addr_t node_eligible = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ phys_addr_t addr, end_addr;
+
+ if (!__damos_valid_target(r, s))
+ continue;
+
+ /* Convert from core address units to physical bytes */
+ addr = (phys_addr_t)r->ar.start * c->addr_unit;
+ end_addr = (phys_addr_t)r->ar.end * c->addr_unit;
+ while (addr < end_addr) {
+ struct folio *folio;
+ phys_addr_t folio_start, folio_end;
+ phys_addr_t overlap_start, overlap_end;
+ phys_addr_t counted;
+
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (!folio) {
+ addr = PAGE_ALIGN_DOWN(addr +
+ PAGE_SIZE);
+ if (!addr)
+ break;
+ continue;
+ }
+
+ /*
+ * Calculate exact overlap between the region
+ * [addr, end_addr) and the folio range.
+ * The folio may start before addr if addr is
+ * in the middle of a large folio.
+ */
+ folio_start = PFN_PHYS(folio_pfn(folio));
+ folio_end = folio_start + folio_size(folio);
+
+ overlap_start = max(addr, folio_start);
+ overlap_end = min(end_addr, folio_end);
+
+ if (overlap_end > overlap_start) {
+ counted = overlap_end - overlap_start;
+ total_eligible += counted;
+ if (folio_nid(folio) == nid)
+ node_eligible += counted;
+ }
+
+ /* Advance past the entire folio */
+ addr = folio_end;
+ folio_put(folio);
+ }
+ cond_resched();
+ }
+ }
+
+ *total = total_eligible;
+ return node_eligible;
+}
+
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ phys_addr_t total_eligible = 0;
+ phys_addr_t node_eligible;
+
+ if (c->ops.id != DAMON_OPS_PADDR)
+ return 0;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid))
+ return 0;
+
+ node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible);
+
+ if (!(unsigned long)total_eligible)
+ return 0;
+
+ return mult_frac((unsigned long)node_eligible, 10000,
+ (unsigned long)total_eligible);
+}
+#else /* CONFIG_DAMON_PADDR */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_DAMON_PADDR */
+#else /* CONFIG_NUMA */
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
@@ -2299,7 +2510,13 @@ static unsigned long damos_get_node_memcg_used_bp(
{
return 0;
}
-#endif
+
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
/*
* Returns LRU-active or inactive memory to total LRU memory size ratio.
@@ -2319,7 +2536,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio)
return mult_frac(inactive, 10000, total);
}
-static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
+static void damos_set_quota_goal_current_value(struct damon_ctx *c,
+ struct damos *s, struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -2345,19 +2563,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = damos_get_in_active_mem_bp(
goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP);
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->current_value = damos_get_node_eligible_mem_bp(c, s,
+ goal->nid);
+ break;
default:
break;
}
}
/* Return the highest score since it makes schemes least aggressive */
-static unsigned long damos_quota_score(struct damos_quota *quota)
+static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s)
{
struct damos_quota_goal *goal;
+ struct damos_quota *quota = &s->quota;
unsigned long highest_score = 0;
damos_for_each_quota_goal(goal, quota) {
- damos_set_quota_goal_current_value(goal);
+ damos_set_quota_goal_current_value(c, s, goal);
highest_score = max(highest_score,
mult_frac(goal->current_value, 10000,
goal->target_value));
@@ -2366,17 +2589,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
return highest_score;
}
-static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
quota->esz_bp = damon_feed_loop_next_input(
max(quota->esz_bp, 10000UL), score);
}
-static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c,
+ struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
if (score >= 10000)
quota->esz_bp = 0;
@@ -2389,9 +2615,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
/*
* Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
*/
-static void damos_set_effective_quota(struct damos_quota *quota,
- struct damon_ctx *ctx)
+static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
unsigned long throughput;
unsigned long esz = ULONG_MAX;
@@ -2402,9 +2628,9 @@ static void damos_set_effective_quota(struct damos_quota *quota,
if (!list_empty(&quota->goals)) {
if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST)
- damos_goal_tune_esz_bp_consist(quota);
+ damos_goal_tune_esz_bp_consist(ctx, s);
else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL)
- damos_goal_tune_esz_bp_temporal(quota);
+ damos_goal_tune_esz_bp_temporal(ctx, s);
esz = quota->esz_bp / 10000;
}
@@ -2452,22 +2678,21 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* First charge window */
if (!quota->total_charged_sz && !quota->charged_from) {
quota->charged_from = jiffies;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
}
/* New charge window starts */
if (!time_in_range_open(jiffies, quota->charged_from,
quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
s->stat.qt_exceeds++;
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
if (trace_damos_esz_enabled())
cached_esz = quota->esz;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
if (trace_damos_esz_enabled() && quota->esz != cached_esz)
damos_trace_esz(c, s, quota);
}
@@ -2521,9 +2746,9 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s)
static void kdamond_apply_schemes(struct damon_ctx *c)
{
struct damon_target *t;
- struct damon_region *r;
struct damos *s;
bool has_schemes_to_apply = false;
+ unsigned long max_region_sz;
damon_for_each_scheme(s, c) {
if (time_before(c->passed_sample_intervals, s->next_apply_sis))
@@ -2540,13 +2765,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ max_region_sz = damon_region_sz_limit(c);
mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
if (c->ops.target_valid && c->ops.target_valid(t) == false)
continue;
-
- damon_for_each_region(r, t)
- damon_do_apply_schemes(c, t, r);
+ damos_apply_target(c, t, max_region_sz);
}
damon_for_each_scheme(s, c) {
@@ -3014,6 +3238,14 @@ static int kdamond_fn(void *data)
kdamond_call(ctx, false);
if (ctx->maybe_corrupted)
break;
+ while (ctx->pause) {
+ damos_walk_cancel(ctx);
+ kdamond_usleep(ctx->attrs.sample_interval);
+ /* allow caller unset pause via damon_call() */
+ kdamond_call(ctx, false);
+ if (kdamond_need_stop(ctx) || ctx->maybe_corrupted)
+ goto done;
+ }
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
else
@@ -3096,14 +3328,20 @@ done:
return 0;
}
-static int walk_system_ram(struct resource *res, void *arg)
+struct damon_system_ram_range_walk_arg {
+ bool walked;
+ struct resource res;
+};
+
+static int damon_system_ram_walk_fn(struct resource *res, void *arg)
{
- struct resource *a = arg;
+ struct damon_system_ram_range_walk_arg *a = arg;
- if (resource_size(a) < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
+ if (!a->walked) {
+ a->walked = true;
+ a->res.start = res->start;
}
+ a->res.end = res->end;
return 0;
}
@@ -3120,27 +3358,24 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra,
return ra / addr_unit;
}
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool damon_find_biggest_system_ram(unsigned long *start,
+static bool damon_find_system_rams_range(unsigned long *start,
unsigned long *end, unsigned long addr_unit)
-
{
- struct resource res = {};
+ struct damon_system_ram_range_walk_arg arg = {};
- walk_system_ram_res(0, -1, &res, walk_system_ram);
- *start = damon_res_to_core_addr(res.start, addr_unit);
- *end = damon_res_to_core_addr(res.end + 1, addr_unit);
+ walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn);
+ if (!arg.walked)
+ return false;
+ *start = damon_res_to_core_addr(arg.res.start, addr_unit);
+ *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit);
if (*end <= *start)
return false;
return true;
}
/**
- * damon_set_region_biggest_system_ram_default() - Set the region of the given
- * monitoring target as requested, or biggest 'System RAM'.
+ * damon_set_region_system_rams_default() - Set the region of the given
+ * monitoring target as requested, or to cover all 'System RAM' resources.
* @t: The monitoring target to set the region.
* @start: The pointer to the start address of the region.
* @end: The pointer to the end address of the region.
@@ -3148,14 +3383,14 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* @min_region_sz: Minimum region size.
*
* This function sets the region of @t as requested by @start and @end. If the
- * values of @start and @end are zero, however, this function finds the biggest
- * 'System RAM' resource and sets the region to cover the resource. In the
- * latter case, this function saves the start and end addresses of the resource
- * in @start and @end, respectively.
+ * values of @start and @end are zero, however, this function finds 'System
+ * RAM' resources and sets the region to cover all the resource. In the latter
+ * case, this function saves the start and the end addresseses of the first and
+ * the last resources in @start and @end, respectively.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+int damon_set_region_system_rams_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
unsigned long addr_unit, unsigned long min_region_sz)
{
@@ -3165,7 +3400,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
return -EINVAL;
if (!*start && !*end &&
- !damon_find_biggest_system_ram(start, end, addr_unit))
+ !damon_find_system_rams_range(start, end, addr_unit))
return -EINVAL;
addr_range.start = *start;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 8cfe7bd3dc1d3..08500a15bbfb8 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* the re-reading, DAMON_LRU_SORT will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Desired active to [in]active memory ratio in bp (1/10,000).
@@ -140,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -149,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -327,7 +328,7 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
goto out;
- err = damon_set_region_biggest_system_ram_default(param_target,
+ err = damon_set_region_system_rams_default(param_target,
&monitor_region_start,
&monitor_region_end,
param_ctx->addr_unit,
@@ -340,18 +341,51 @@ out:
return err;
}
-static int damon_lru_sort_handle_commit_inputs(void)
+static int damon_lru_sort_commit_inputs_fn(void *arg)
{
+ return damon_lru_sort_apply_parameters();
+}
+
+static int damon_lru_sort_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_lru_sort_commit_inputs_fn,
+ };
+
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
- if (!commit_inputs)
+ if (!commit_inputs_request)
return 0;
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_lru_sort_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_lru_sort_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -365,7 +399,7 @@ static int damon_lru_sort_damon_call_fn(void *arg)
damon_lru_sort_cold_stat = s->stat;
}
- return damon_lru_sort_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index c3e4c871b0bb2..5c93ef2bb8a97 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
damon_max_nr_accesses(&c->attrs);
age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
+ if (age_in_sec)
+ age_in_log = min_t(int, ilog2(age_in_sec) + 1,
+ DAMON_MAX_AGE_IN_LOG);
+ else
+ age_in_log = 0;
+
/* If frequency is 0, higher age means it's colder */
if (freq_subscore == 0)
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 96f6dfc28eae4..1f0319bfa0788 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* re-reading, DAMON_RECLAIM will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Time threshold for cold memory regions identification in microseconds.
@@ -92,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600);
static unsigned long quota_autotune_feedback __read_mostly;
module_param(quota_autotune_feedback, ulong, 0600);
+/*
+ * Auto-tune monitoring intervals.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's
+ * sampling and aggregation intervals. The auto-tuning aims to capture
+ * meaningful amount of access events in each DAMON-snapshot, while keeping the
+ * sampling intervals 5 milliseconds in minimum, and 10 seconds in maximum.
+ * Setting this as ``N`` disables the auto-tuning.
+ *
+ * Disabled by default.
+ */
+static bool autotune_monitoring_intervals __read_mostly;
+module_param(autotune_monitoring_intervals, bool, 0600);
+
static struct damos_watermarks damon_reclaim_wmarks = {
.metric = DAMOS_WMARK_FREE_MEM_RATE,
.interval = 5000000, /* 5 seconds */
@@ -114,7 +127,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -123,7 +137,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -151,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
static struct damon_ctx *ctx;
static struct damon_target *target;
-static struct damos *damon_reclaim_new_scheme(void)
+static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval)
{
struct damos_access_pattern pattern = {
/* Find regions having PAGE_SIZE or larger size */
@@ -161,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void)
.min_nr_accesses = 0,
.max_nr_accesses = 0,
/* for min_age or more micro-seconds */
- .min_age_region = min_age /
- damon_reclaim_mon_attrs.aggr_interval,
+ .min_age_region = min_age / aggr_interval,
.max_age_region = UINT_MAX,
};
@@ -183,6 +197,7 @@ static int damon_reclaim_apply_parameters(void)
{
struct damon_ctx *param_ctx;
struct damon_target *param_target;
+ struct damon_attrs attrs;
struct damos *scheme;
struct damos_quota_goal *goal;
struct damos_filter *filter;
@@ -200,12 +215,21 @@ static int damon_reclaim_apply_parameters(void)
goto out;
}
- err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
+ attrs = damon_reclaim_mon_attrs;
+ if (autotune_monitoring_intervals) {
+ attrs.sample_interval = 5000;
+ attrs.aggr_interval = 100000;
+ attrs.intervals_goal.access_bp = 40;
+ attrs.intervals_goal.aggrs = 3;
+ attrs.intervals_goal.min_sample_us = 5000;
+ attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000;
+ }
+ err = damon_set_attrs(param_ctx, &attrs);
if (err)
goto out;
err = -ENOMEM;
- scheme = damon_reclaim_new_scheme();
+ scheme = damon_reclaim_new_scheme(attrs.aggr_interval);
if (!scheme)
goto out;
damon_set_schemes(param_ctx, &scheme, 1);
@@ -233,11 +257,9 @@ static int damon_reclaim_apply_parameters(void)
damos_add_filter(scheme, filter);
}
- err = damon_set_region_biggest_system_ram_default(param_target,
- &monitor_region_start,
- &monitor_region_end,
- param_ctx->addr_unit,
- param_ctx->min_region_sz);
+ err = damon_set_region_system_rams_default(param_target,
+ &monitor_region_start, &monitor_region_end,
+ param_ctx->addr_unit, param_ctx->min_region_sz);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
@@ -246,18 +268,51 @@ out:
return err;
}
-static int damon_reclaim_handle_commit_inputs(void)
+static int damon_reclaim_commit_inputs_fn(void *arg)
{
+ return damon_reclaim_apply_parameters();
+}
+
+static int damon_reclaim_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_reclaim_commit_inputs_fn,
+ };
+
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
- if (!commit_inputs)
+ if (!commit_inputs_request)
return 0;
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_reclaim_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_reclaim_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -267,7 +322,7 @@ static int damon_reclaim_damon_call_fn(void *arg)
damon_for_each_scheme(s, c)
damon_reclaim_stat = s->stat;
- return damon_reclaim_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 3951b762cbddf..0e14f5bb8f75d 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data)
return 0;
}
-struct damon_stat_system_ram_range_walk_arg {
- bool walked;
- struct resource res;
-};
-
-static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg)
-{
- struct damon_stat_system_ram_range_walk_arg *a = arg;
-
- if (!a->walked) {
- a->walked = true;
- a->res.start = res->start;
- }
- a->res.end = res->end;
- return 0;
-}
-
-static unsigned long damon_stat_res_to_core_addr(resource_size_t ra,
- unsigned long addr_unit)
-{
- /*
- * Use div_u64() for avoiding linking errors related with __udivdi3,
- * __aeabi_uldivmod, or similar problems. This should also improve the
- * performance optimization (read div_u64() comment for the detail).
- */
- if (sizeof(ra) == 8 && sizeof(addr_unit) == 4)
- return div_u64(ra, addr_unit);
- return ra / addr_unit;
-}
-
-static int damon_stat_set_monitoring_region(struct damon_target *t,
- unsigned long addr_unit, unsigned long min_region_sz)
-{
- struct damon_addr_range addr_range;
- struct damon_stat_system_ram_range_walk_arg arg = {};
-
- walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn);
- if (!arg.walked)
- return -EINVAL;
- addr_range.start = damon_stat_res_to_core_addr(
- arg.res.start, addr_unit);
- addr_range.end = damon_stat_res_to_core_addr(
- arg.res.end + 1, addr_unit);
- if (addr_range.end <= addr_range.start)
- return -EINVAL;
- return damon_set_regions(t, &addr_range, 1, min_region_sz);
-}
-
static struct damon_ctx *damon_stat_build_ctx(void)
{
struct damon_ctx *ctx;
struct damon_attrs attrs;
struct damon_target *target;
+ unsigned long start = 0, end = 0;
ctx = damon_new_ctx();
if (!ctx)
@@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (!target)
goto free_out;
damon_add_target(ctx, target);
- if (damon_stat_set_monitoring_region(target, ctx->addr_unit,
- ctx->min_region_sz))
+ if (damon_set_region_system_rams_default(target, &start, &end,
+ ctx->addr_unit, ctx->min_region_sz))
goto free_out;
return ctx;
free_out:
@@ -313,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp)
return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N');
}
+static int damon_stat_kdamond_pid_store(
+ const char *val, const struct kernel_param *kp)
+{
+ /*
+ * kdamond_pid is read-only, but kernel command line could write it.
+ * Do nothing here.
+ */
+ return 0;
+}
+
+static int damon_stat_kdamond_pid_load(
+ char *buffer, const struct kernel_param *kp)
+{
+ int pid;
+
+ if (!damon_stat_context) {
+ pid = -1;
+ } else {
+ pid = damon_kdamond_pid(damon_stat_context);
+ if (pid < 1)
+ pid = -1;
+ }
+ return sprintf(buffer, "%d\n", pid);
+}
+
+static const struct kernel_param_ops kdamond_pid_param_ops = {
+ .set = damon_stat_kdamond_pid_store,
+ .get = damon_stat_kdamond_pid_load,
+};
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_STAT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400);
+MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond");
+
static int __init damon_stat_init(void)
{
int err = 0;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index a8014780edae9..ab2153fff9a83 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1093,6 +1093,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_INACTIVE_MEM_BP,
.name = "inactive_mem_bp",
},
+ {
+ .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
+ .name = "node_eligible_mem_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -1508,6 +1512,8 @@ struct damon_sysfs_quotas {
unsigned long reset_interval_ms;
unsigned long effective_sz; /* Effective size quota in bytes */
enum damos_quota_goal_tuner goal_tuner;
+ unsigned int fail_charge_num;
+ unsigned int fail_charge_denom;
};
static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
@@ -1682,6 +1688,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t fail_charge_num_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_num);
+}
+
+static ssize_t fail_charge_num_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_num);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t fail_charge_denom_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom);
+}
+
+static ssize_t fail_charge_denom_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_denom);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
static void damon_sysfs_quotas_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
@@ -1702,12 +1750,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr =
static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr =
__ATTR_RW_MODE(goal_tuner, 0600);
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr =
+ __ATTR_RW_MODE(fail_charge_num, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr =
+ __ATTR_RW_MODE(fail_charge_denom, 0600);
+
static struct attribute *damon_sysfs_quotas_attrs[] = {
&damon_sysfs_quotas_ms_attr.attr,
&damon_sysfs_quotas_sz_attr.attr,
&damon_sysfs_quotas_reset_interval_ms_attr.attr,
&damon_sysfs_quotas_effective_bytes_attr.attr,
&damon_sysfs_quotas_goal_tuner_attr.attr,
+ &damon_sysfs_quotas_fail_charge_num_attr.attr,
+ &damon_sysfs_quotas_fail_charge_denom_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_quotas);
@@ -2061,6 +2117,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
.name = "nohugepage",
},
{
+ .action = DAMOS_COLLAPSE,
+ .name = "collapse",
+ },
+ {
.action = DAMOS_LRU_PRIO,
.name = "lru_prio",
},
@@ -2685,6 +2745,9 @@ static int damos_sysfs_add_quota_score(
}
goal->nid = sysfs_goal->nid;
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
default:
break;
}
@@ -2796,6 +2859,8 @@ static struct damos *damon_sysfs_mk_scheme(
.weight_nr_accesses = sysfs_weights->nr_accesses,
.weight_age = sysfs_weights->age,
.goal_tuner = sysfs_quotas->goal_tuner,
+ .fail_charge_num = sysfs_quotas->fail_charge_num,
+ .fail_charge_denom = sysfs_quotas->fail_charge_denom,
};
struct damos_watermarks wmarks = {
.metric = sysfs_wmarks->metric,
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index eefa959aa30ae..d5863cc33d230 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -866,6 +866,7 @@ struct damon_sysfs_context {
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
+ bool pause;
};
static struct damon_sysfs_context *damon_sysfs_context_alloc(
@@ -878,6 +879,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
context->kobj = (struct kobject){};
context->ops_id = ops_id;
context->addr_unit = 1;
+ context->pause = false;
return context;
}
@@ -1053,6 +1055,30 @@ static ssize_t addr_unit_store(struct kobject *kobj,
return count;
}
+static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%c\n", context->pause ? 'Y' : 'N');
+}
+
+static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ bool pause;
+ int err = kstrtobool(buf, &pause);
+
+ if (err)
+ return err;
+ context->pause = pause;
+ return count;
+}
+
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1067,10 +1093,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr =
static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
__ATTR_RW_MODE(addr_unit, 0600);
+static struct kobj_attribute damon_sysfs_context_pause_attr =
+ __ATTR_RW_MODE(pause, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
&damon_sysfs_context_addr_unit_attr.attr,
+ &damon_sysfs_context_pause_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1470,6 +1500,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
if (sys_ctx->ops_id == DAMON_OPS_PADDR)
ctx->min_region_sz = max(
DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1);
+ ctx->pause = sys_ctx->pause;
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 9e5904c2beeb2..1b23a22ac04c4 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -694,6 +694,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 2,
.sz = 3,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST,
+ .fail_charge_num = 2,
+ .fail_charge_denom = 3,
.weight_sz = 4,
.weight_nr_accesses = 5,
.weight_age = 6,
@@ -703,6 +705,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 8,
.sz = 9,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL,
+ .fail_charge_num = 1,
+ .fail_charge_denom = 1024,
.weight_sz = 10,
.weight_nr_accesses = 11,
.weight_age = 12,
@@ -717,6 +721,8 @@ static void damos_test_commit_quota(struct kunit *test)
KUNIT_EXPECT_EQ(test, dst.ms, src.ms);
KUNIT_EXPECT_EQ(test, dst.sz, src.sz);
KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom);
KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz);
KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses);
KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age);
@@ -1077,6 +1083,10 @@ static void damon_test_commit_ctx(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
src->min_region_sz = 4095;
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL);
+ src->min_region_sz = 4096;
+ src->pause = true;
+ KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
+ KUNIT_EXPECT_TRUE(test, dst->pause);
damon_destroy_ctx(src);
damon_destroy_ctx(dst);
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b069dbc7e3d25..dd5f2d7027ac4 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -903,6 +903,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_COLLAPSE:
+ madv_action = MADV_COLLAPSE;
+ break;
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c1..ab34cab2416a4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3314,6 +3314,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
bool force_thp_readahead = false;
unsigned short mmap_miss;
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
+
/* Use the readahead code, even if readahead is disabled */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
(vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
@@ -3396,6 +3398,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->start = max(ra->start, vmf->vma->vm_pgoff);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra->order = 0;
@@ -3438,6 +3441,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
}
if (folio_test_readahead(folio)) {
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_ra(&ractl, folio, ra->ra_pages);
}
@@ -3747,8 +3751,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned short *mmap_miss,
- pgoff_t file_end)
+ unsigned long *rss, pgoff_t file_end)
{
struct address_space *mapping = folio->mapping;
unsigned int ref_from_caller = 1;
@@ -3781,16 +3784,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
goto skip;
/*
- * If there are too many folios that are recently evicted
- * in a file, they will probably continue to be evicted.
- * In such situation, read-ahead is only a waste of IO.
- * Don't decrease mmap_miss in this scenario to make sure
- * we can stop read-ahead.
- */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
- /*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
@@ -3836,7 +3829,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned short *mmap_miss)
+ unsigned long *rss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3844,10 +3837,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
if (PageHWPoison(page))
goto out;
- /* See comment of filemap_map_folio_range() */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
/*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit
@@ -3882,7 +3871,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
vm_fault_t ret = 0;
unsigned long rss = 0;
unsigned int nr_pages = 0, folio_type;
- unsigned short mmap_miss = 0, mmap_miss_saved;
/*
* Recalculate end_pgoff based on file_end before calling
@@ -3921,6 +3909,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
folio_type = mm_counter_file(folio);
do {
unsigned long end;
+ vm_fault_t map_ret;
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
@@ -3928,13 +3917,35 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
end = folio_next_index(folio) - 1;
nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
- if (!folio_test_large(folio))
- ret |= filemap_map_order0_folio(vmf,
- folio, addr, &rss, &mmap_miss);
- else
- ret |= filemap_map_folio_range(vmf, folio,
- xas.xa_index - folio->index, addr,
- nr_pages, &rss, &mmap_miss, file_end);
+ if (!folio_test_large(folio)) {
+ map_ret = filemap_map_order0_folio(vmf, folio, addr,
+ &rss);
+ } else {
+ unsigned long start = xas.xa_index - folio->index;
+
+ map_ret = filemap_map_folio_range(vmf, folio, start,
+ addr, nr_pages, &rss,
+ file_end);
+ }
+ ret |= map_ret;
+
+ /*
+ * If there are too many folios that are recently evicted
+ * in a file, they will probably continue to be evicted.
+ * In such situation, read-ahead is only a waste of IO.
+ * Don't decrease mmap_miss in this scenario to make sure
+ * we can stop read-ahead.
+ */
+ if ((map_ret & VM_FAULT_NOPAGE) &&
+ !(vmf->flags & FAULT_FLAG_TRIED) &&
+ !folio_test_workingset(folio)) {
+ unsigned short mmap_miss;
+
+ mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(file->f_ra.mmap_miss,
+ mmap_miss - 1);
+ }
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -3944,12 +3955,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
out:
rcu_read_unlock();
- mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
- if (mmap_miss >= mmap_miss_saved)
- WRITE_ONCE(file->f_ra.mmap_miss, 0);
- else
- WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
-
return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
diff --git a/mm/gup.c b/mm/gup.c
index ad9ded39609cb..0692119b79043 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!folio)
goto pte_unmap;
- if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
- unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) ||
+ unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 653f2dc034036..b7df167f7acbf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -429,61 +429,75 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
return count;
}
+enum defrag_mode {
+ DEFRAG_ALWAYS = 0,
+ DEFRAG_DEFER,
+ DEFRAG_DEFER_MADVISE,
+ DEFRAG_MADVISE,
+ DEFRAG_NEVER,
+};
+
+static const char * const defrag_mode_strings[] = {
+ [DEFRAG_ALWAYS] = "always",
+ [DEFRAG_DEFER] = "defer",
+ [DEFRAG_DEFER_MADVISE] = "defer+madvise",
+ [DEFRAG_MADVISE] = "madvise",
+ [DEFRAG_NEVER] = "never",
+};
+
+static const enum transparent_hugepage_flag defrag_flags[] = {
+ [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+ [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+};
+
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- const char *output;
+ int active = DEFRAG_NEVER;
+ int len = 0;
+ int i;
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- &transparent_hugepage_flags))
- output = "[always] defer defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- &transparent_hugepage_flags))
- output = "always [defer] defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer [defer+madvise] madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer defer+madvise [madvise] never";
- else
- output = "always defer defer+madvise madvise [never]";
+ for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) {
+ if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) {
+ active = i;
+ break;
+ }
+ }
- return sysfs_emit(buf, "%s\n", output);
+ for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) {
+ if (i == active)
+ len += sysfs_emit_at(buf, len, "[%s] ",
+ defrag_mode_strings[i]);
+ else
+ len += sysfs_emit_at(buf, len, "%s ",
+ defrag_mode_strings[i]);
+ }
+
+ /* Replace trailing space with newline */
+ buf[len - 1] = '\n';
+
+ return len;
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (sysfs_streq(buf, "always")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer+madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "never")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else
+ int mode, m;
+
+ mode = sysfs_match_string(defrag_mode_strings, buf);
+ if (mode < 0)
return -EINVAL;
+ for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) {
+ if (m == mode)
+ set_bit(defrag_flags[m], &transparent_hugepage_flags);
+ else
+ clear_bit(defrag_flags[m], &transparent_hugepage_flags);
+ }
+
return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
@@ -4192,11 +4206,10 @@ fail:
folio_unlock(new_folio);
/*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
+ * Subpages whose mapping has been zapped may be freed
+ * earlier, but freeing them requires taking the
+ * lru_lock, so we defer put_page() on tail pages until
+ * after the split completes.
*/
free_folio_and_swap_cache(new_folio);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c921287489de3..571212b80835e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2862,6 +2862,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct hugetlb_cgroup *h_cg_rsvd = NULL;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
idx = hstate_index(h);
@@ -2912,7 +2913,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
- idx, pages_per_huge_page(h), &h_cg);
+ idx, pages_per_huge_page(h), &h_cg_rsvd);
if (ret)
goto out_subpool_put;
}
@@ -2954,7 +2955,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, folio);
+ h_cg_rsvd, folio);
}
spin_unlock_irq(&hugetlb_lock);
@@ -3006,7 +3007,7 @@ out_uncharge_cgroup:
out_uncharge_cgroup_reservation:
if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
- h_cg);
+ h_cg_rsvd);
out_subpool_put:
/*
* put page to subpool iff the quota of subpool's rsv_hpages is used
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 5725a367246d9..10424cd25e5a6 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
break;
}
- kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp,
policy_name, !!test_cache);
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043f..28a843f30b32b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2528,8 +2528,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
cc->progress++;
continue;
}
- hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
- hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
+ hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend) {
cc->progress++;
continue;
@@ -2808,6 +2808,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
case SCAN_PAGE_FILLED:
+ case SCAN_PAGE_HAS_PRIVATE:
case SCAN_PAGE_DIRTY_OR_WRITEBACK:
return -EAGAIN;
/*
@@ -2845,8 +2846,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmgrab(mm);
lru_add_drain_all();
- hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = end & HPAGE_PMD_MASK;
+ hstart = ALIGN(start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE);
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
enum scan_result result = SCAN_FAIL;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2eff0d6b622b6..7c7ba17ce7af0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,6 +92,7 @@
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
+#include <linux/xarray.h>
#include <linux/crc32.h>
#include <asm/sections.h>
@@ -157,6 +158,8 @@ struct kmemleak_object {
struct hlist_head area_list;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
+ /* per-scan dedup count, valid only while in scan-local dedup xarray */
+ unsigned int dup_count;
char comm[TASK_COMM_LEN]; /* executable name */
};
@@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object)
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
*/
-static void print_unreferenced(struct seq_file *seq,
- struct kmemleak_object *object)
+static void __print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object,
+ bool hex_dump)
{
int i;
unsigned long *entries;
@@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq,
object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
- hex_dump_object(seq, object);
+ if (hex_dump)
+ hex_dump_object(seq, object);
warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum);
for (i = 0; i < nr_entries; i++) {
@@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq,
}
}
+static void print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ __print_unreferenced(seq, object, true);
+}
+
/*
* Print the kmemleak_object information. This function is used mainly for
* debugging special cases when kmemleak operations. It must be called with
@@ -1685,6 +1696,103 @@ unlock_put:
}
/*
+ * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it
+ * does not touch user memory that was freed concurrently; the rest of the
+ * report (backtrace, comm, pid) is always emitted since the kmemleak_object
+ * metadata is pinned by the caller.
+ */
+static void print_leak_locked(struct kmemleak_object *object, bool hex_dump)
+{
+ raw_spin_lock_irq(&object->lock);
+ __print_unreferenced(NULL, object,
+ hex_dump && (object->flags & OBJECT_ALLOCATED));
+ raw_spin_unlock_irq(&object->lock);
+}
+
+/*
+ * Per-scan dedup table for verbose leak printing. The xarray is keyed by
+ * stackdepot trace_handle and stores a pointer to the representative
+ * kmemleak_object. The per-scan repeat count lives in object->dup_count.
+ *
+ * dedup_record() must run outside object->lock: xa_store() may take
+ * mutexes (xa_node slab allocation) which lockdep would flag against the
+ * raw spinlock object->lock.
+ */
+static void dedup_record(struct xarray *dedup, struct kmemleak_object *object,
+ depot_stack_handle_t trace_handle)
+{
+ struct kmemleak_object *rep;
+ void *old;
+
+ /*
+ * No stack trace to dedup against: early-boot allocation tracked
+ * before kmemleak_init() set up object_cache, or stack_depot_save()
+ * failure under memory pressure.
+ */
+ if (!trace_handle) {
+ print_leak_locked(object, true);
+ return;
+ }
+
+ /* stack is available, now we can de-dup */
+ rep = xa_load(dedup, trace_handle);
+ if (rep) {
+ rep->dup_count++;
+ return;
+ }
+
+ /*
+ * Object is being torn down (use_count already hit zero); the
+ * tracked memory at object->pointer is unsafe to read, so skip.
+ */
+ if (!get_object(object))
+ return;
+
+ object->dup_count = 1;
+ old = xa_store(dedup, trace_handle, object, GFP_ATOMIC);
+ if (xa_is_err(old)) {
+ /* xa_node allocation failed; fall back to inline print. */
+ print_leak_locked(object, true);
+ put_object(object);
+ return;
+ }
+ /*
+ * scan_mutex serialises all writers to the dedup xarray, so xa_store()
+ * after a NULL xa_load() must always overwrite an empty slot.
+ */
+ WARN_ON_ONCE(old);
+}
+
+/*
+ * Drain the dedup table. Re-acquires object->lock and re-checks
+ * OBJECT_ALLOCATED before printing: while get_object() pins the
+ * kmemleak_object metadata, the underlying tracked allocation may have
+ * been freed since the scan walked it (kmemleak_free clears
+ * OBJECT_ALLOCATED under object->lock before the user memory goes away).
+ * The hex dump is skipped for coalesced entries since the bytes would
+ * differ across objects anyway.
+ */
+static void dedup_flush(struct xarray *dedup)
+{
+ struct kmemleak_object *object;
+ unsigned long idx;
+ unsigned int dup;
+ bool coalesced;
+
+ xa_for_each(dedup, idx, object) {
+ dup = object->dup_count;
+ coalesced = dup > 1;
+
+ print_leak_locked(object, !coalesced);
+ if (coalesced)
+ pr_warn(" ... and %u more object(s) with the same backtrace\n",
+ dup - 1);
+ put_object(object);
+ xa_erase(dedup, idx);
+ }
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1694,6 +1802,7 @@ static void kmemleak_scan(void)
struct kmemleak_object *object;
struct zone *zone;
int __maybe_unused i;
+ struct xarray dedup;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1834,10 +1943,18 @@ static void kmemleak_scan(void)
return;
/*
- * Scanning result reporting.
+ * Scanning result reporting. When verbose printing is enabled, dedupe
+ * by stackdepot trace_handle so each unique backtrace is logged once
+ * per scan, annotated with the number of objects that share it. The
+ * per-leak count below still reflects every object, and
+ * /sys/kernel/debug/kmemleak still lists them individually.
*/
+ xa_init(&dedup);
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
+ depot_stack_handle_t trace_handle;
+ bool dedup_print;
+
if (need_resched())
kmemleak_cond_resched(object);
@@ -1849,18 +1966,33 @@ static void kmemleak_scan(void)
if (!color_white(object))
continue;
raw_spin_lock_irq(&object->lock);
+ trace_handle = 0;
+ dedup_print = false;
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
-
- if (kmemleak_verbose)
- print_unreferenced(NULL, object);
-
+ if (kmemleak_verbose) {
+ trace_handle = object->trace_handle;
+ dedup_print = true;
+ }
new_leaks++;
}
raw_spin_unlock_irq(&object->lock);
+
+ /*
+ * Defer the verbose print outside object->lock: xa_store()
+ * may take xa_node slab locks at a higher wait-context level
+ * which lockdep would flag against the raw_spinlock_t
+ * object->lock. rcu_read_lock() keeps the kmemleak_object
+ * alive across the call.
+ */
+ if (dedup_print)
+ dedup_record(&dedup, object, trace_handle);
}
rcu_read_unlock();
+ /* Flush'em all */
+ dedup_flush(&dedup);
+ xa_destroy(&dedup);
if (new_leaks) {
kmemleak_found_leaks = true;
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf56..cd9bb077072cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
tlb_finish_mmu(madv_behavior->tlb);
}
-static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+/**
+ * check_input_range() - Check if the requested range is valid.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ *
+ * Returns: 0 if the input range is valid, otherwise an error code.
+ */
+static int check_input_range(unsigned long start, size_t len_in)
{
size_t len;
- if (!madvise_behavior_valid(behavior))
- return false;
-
if (!PAGE_ALIGNED(start))
- return false;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return false;
+ return -EINVAL;
if (start + len < start)
- return false;
-
- return true;
-}
+ return -EINVAL;
-/*
- * madvise_should_skip() - Return if the request is invalid or nothing.
- * @start: Start address of madvise-requested address range.
- * @len_in: Length of madvise-requested address range.
- * @behavior: Requested madvise behavior.
- * @err: Pointer to store an error code from the check.
- *
- * If the specified behaviour is invalid or nothing would occur, we skip the
- * operation. This function returns true in the cases, otherwise false. In
- * the former case we store an error on @err.
- */
-static bool madvise_should_skip(unsigned long start, size_t len_in,
- int behavior, int *err)
-{
- if (!is_valid_madvise(start, len_in, behavior)) {
- *err = -EINVAL;
- return true;
- }
- if (start + PAGE_ALIGN(len_in) == start) {
- *err = 0;
- return true;
- }
- return false;
+ return 0;
}
static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
@@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
.tlb = &tlb,
};
- if (madvise_should_skip(start, len_in, behavior, &error))
+ if (!madvise_behavior_valid(behavior))
+ return -EINVAL;
+
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
return error;
+
error = madvise_lock(&madv_behavior);
if (error)
return error;
@@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
size_t len_in = iter_iov_len(iter);
int error;
- if (madvise_should_skip(start, len_in, behavior, &error))
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
ret = error;
else
ret = madvise_do_behavior(start, len_in, &madv_behavior);
@@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
+ if (!madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
/*
* We need only perform this check if we are attempting to manipulate a
* remote process's address space.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1a4fd2504bcdf..431cad99189f9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4005,11 +4005,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg1_alloc_events(memcg))
goto fail;
+ pstatc_pcpu = parent ? parent->vmstats_percpu : NULL;
for_each_possible_cpu(cpu) {
- if (parent)
- pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
+ statc->parent_pcpu = pstatc_pcpu;
statc->vmstats = memcg->vmstats;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d47aef256a324..eff405a21c68b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -459,7 +459,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
* Only do anything when FORCEKILL is set, otherwise just free the
* list (this is used for clean pages which do not need killing)
*/
-static void kill_procs(struct list_head *to_kill, int forcekill,
+static void kill_procs(struct list_head *to_kill, bool forcekill,
unsigned long pfn, int flags)
{
struct to_kill *tk, *next;
@@ -1418,7 +1418,7 @@ try_again:
* We raced with (possibly temporary) unhandlable
* page, retry.
*/
- if (pass++ < 3) {
+ if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
shake_page(p);
goto try_again;
}
@@ -1582,7 +1582,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
{
LIST_HEAD(tokill);
bool unmap_success;
- int forcekill;
+ bool forcekill;
bool mlocked = folio_test_mlocked(folio);
/*
@@ -1703,7 +1703,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
unmap_mapping_range(mapping, start, size, 0);
}
- kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
+ kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 86a973119bd46..0c9d9c2cbf0e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
* Handle the case of a page which we actually need to copy to a new page,
* either due to COW or unsharing.
*
- * Called with mmap_lock locked and the old page referenced, but
- * without the ptl held.
+ * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK)
+ * and the old page referenced, but without the ptl held.
*
* High level logic flow:
*
@@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
* though the page will change only once the write actually happens. This
* avoids a few races, and potentially makes it more efficient.
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. We return with
+ * the same lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
@@ -4785,12 +4785,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
- * We return with the mmap_lock locked or unlocked in the same cases
- * as does filemap_fault().
+ * When returning, the lock may have been released in the same cases
+ * as done by filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
@@ -5330,9 +5330,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked.
+ * We return with the lock still held, but pte unmapped and unlocked.
+ * If VM_FAULT_RETRY is returned, the lock may have been released.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
@@ -5440,9 +5441,10 @@ oom:
}
/*
- * The mmap_lock must have been held on entry, and may have been
- * released depending on flags and vma->vm_ops->fault() return value.
- * See filemap_fault() and __lock_page_retry().
+ * Either the VMA lock or the mmap_lock must have been held on entry
+ * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on
+ * flags and vma->vm_ops->fault() return value.
+ * See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
@@ -5480,7 +5482,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
if (unlikely(PageHWPoison(vmf->page))) {
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
- if (page_mapped(vmf->page))
+ if (folio_mapped(folio))
unmap_mapping_folio(folio);
/* Retry if a clean folio was removed from the cache. */
if (mapping_evict_folio(folio->mapping, folio))
@@ -6003,11 +6005,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults).
- * The mmap_lock may have been released depending on flags and our
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK).
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
- * If mmap_lock is released, vma may become invalid (for example
+ * If the lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
@@ -6374,10 +6376,11 @@ static void fix_spurious_fault(struct vm_fault *vmf,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
- * concurrent faults).
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our return value.
+ * The mmap_lock or VMA lock may have been released depending on flags
+ * and our return value.
* See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
@@ -6458,8 +6461,8 @@ unlock:
/*
* On entry, we hold either the VMA lock or the mmap_lock
- * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
- * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in
+ * the result, the lock is not held on exit. See filemap_fault()
* and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
@@ -6691,9 +6694,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
/*
* By the time we get here, we already hold either the VMA lock or the
- * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
+ * mmap_lock (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 40c7915dabe05..462d8dcd636dc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
+ * @pgmap: device page map or %NULL if not ZONE_DEVICE
*
* Generic helper function to remove section mappings and sysfs entries
* for the section of the memory we are removing. Caller needs to make
@@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* calling offline_pages().
*/
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
@@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- sparse_remove_section(pfn, cur_nr_pages, altmap);
+ sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap);
}
}
@@ -1427,7 +1428,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
remove_memory_block_devices(cur_start, memblock_size);
- arch_remove_memory(cur_start, memblock_size, altmap);
+ arch_remove_memory(cur_start, memblock_size, altmap, NULL);
/* Verify that all vmemmap pages have actually been freed. */
WARN(altmap->alloc, "Altmap not fully unmapped");
@@ -1470,7 +1471,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
ret = create_memory_block_devices(cur_start, memblock_size, nid,
params.altmap, group);
if (ret) {
- arch_remove_memory(cur_start, memblock_size, NULL);
+ arch_remove_memory(cur_start, memblock_size, params.altmap, NULL);
kfree(params.altmap);
goto out;
}
@@ -1556,7 +1557,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
/* create memory block devices after memory was added */
ret = create_memory_block_devices(start, size, nid, NULL, group);
if (ret) {
- arch_remove_memory(start, size, params.altmap);
+ arch_remove_memory(start, size, params.altmap, NULL);
goto error;
}
}
@@ -2268,7 +2269,7 @@ static int try_remove_memory(u64 start, u64 size)
* No altmaps present, do the removal directly
*/
remove_memory_block_devices(start, size);
- arch_remove_memory(start, size, NULL);
+ arch_remove_memory(start, size, NULL, NULL);
} else {
/* all memblocks in the range have altmaps */
remove_memory_blocks_and_altmaps(start, size);
diff --git a/mm/memremap.c b/mm/memremap.c
index 053842d45cb10..81766d8224009 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), NULL);
+ PHYS_PFN(range_len(range)), NULL, pgmap);
} else {
arch_remove_memory(range->start, range_len(range),
- pgmap_altmap(pgmap));
+ pgmap_altmap(pgmap), pgmap);
kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index 8a64291ab5b44..0c6a0ab6eccef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
* This is safe because nobody is using it except us.
*/
enum {
- PAGE_WAS_MAPPED = BIT(0),
- PAGE_WAS_MLOCKED = BIT(1),
- PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+ FOLIO_WAS_MAPPED = BIT(0),
+ FOLIO_WAS_MLOCKED = BIT(1),
+ FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED,
};
static void __migrate_folio_record(struct folio *dst,
- int old_page_state,
- struct anon_vma *anon_vma)
+ int old_folio_state, struct anon_vma *anon_vma)
{
- dst->private = (void *)anon_vma + old_page_state;
+ dst->private = (void *)anon_vma + old_folio_state;
}
static void __migrate_folio_extract(struct folio *dst,
- int *old_page_state,
- struct anon_vma **anon_vmap)
+ int *old_folio_state, struct anon_vma **anon_vmap)
{
unsigned long private = (unsigned long)dst->private;
- *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
- *old_page_state = private & PAGE_OLD_STATES;
+ *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
+ *old_folio_state = private & FOLIO_OLD_STATES;
dst->private = NULL;
}
@@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
{
struct folio *dst;
int rc = -EAGAIN;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool locked = false;
bool dst_locked = false;
@@ -1253,7 +1251,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
}
locked = true;
if (folio_test_mlocked(src))
- old_page_state |= PAGE_WAS_MLOCKED;
+ old_folio_state |= FOLIO_WAS_MLOCKED;
if (folio_test_writeback(src)) {
/*
@@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
dst_locked = true;
if (unlikely(page_has_movable_ops(&src->page))) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
- old_page_state |= PAGE_WAS_MAPPED;
+ old_folio_state |= FOLIO_WAS_MAPPED;
}
if (!folio_mapped(src)) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1344,7 +1342,7 @@ out:
if (rc == -EAGAIN)
ret = NULL;
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, locked, ret);
migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
@@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct list_head *ret)
{
int rc;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool src_deferred_split = false;
bool src_partially_mapped = false;
struct list_head *prev;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
@@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
* isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst);
- if (old_page_state & PAGE_WAS_MLOCKED)
+ if (old_folio_state & FOLIO_WAS_MLOCKED)
lru_add_drain();
- if (old_page_state & PAGE_WAS_MAPPED)
+ if (old_folio_state & FOLIO_WAS_MAPPED)
remove_migration_ptes(src, dst, 0);
out_unlock_both:
@@ -1439,11 +1437,11 @@ out:
*/
if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return rc;
}
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
@@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios,
dst = list_first_entry(dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, src_folios, lru) {
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret_folios);
list_del(&dst->lru);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 19cd14b341146..554754eb26ff2 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
bool flush = false;
unsigned long i;
- VM_WARN_ON_FOLIO(!folio, folio);
- VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+ VM_WARN_ON_ONCE(!folio);
if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
return -EINVAL;
@@ -859,11 +858,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
if (userfaultfd_missing(vma))
goto unlock_abort;
- if (!pmd_none(*pmdp)) {
- if (!is_huge_zero_pmd(*pmdp))
- goto unlock_abort;
+ if (is_huge_zero_pmd(*pmdp))
flush = true;
- } else if (!pmd_none(*pmdp))
+ else if (!pmd_none(*pmdp))
goto unlock_abort;
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921cd..bd466a3c10c8e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -674,6 +674,20 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_ZONE_DEVICE
+static __meminit void pageblock_migratetype_init_range(unsigned long pfn,
+ unsigned long nr_pages, int migratetype)
+{
+ const unsigned long end = pfn + nr_pages;
+
+ for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) {
+ init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+ }
+}
+#endif
+
/*
* Initialize a reserved page unconditionally, finding its zone first.
*/
@@ -1012,21 +1026,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
page->zone_device_data = NULL;
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (pageblock_aligned(pfn)) {
- init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
- cond_resched();
- }
-
- /*
* ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
* directly to the driver page allocator which will set the page count
* to 1 when allocating the page.
@@ -1056,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* of how the sparse_vmemmap internals handle compound pages in the lack
* of an altmap. See vmemmap_populate_compound_pages().
*/
-static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+static inline unsigned long compound_nr_pages(unsigned long pfn,
+ struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- if (!vmemmap_can_optimize(altmap, pgmap))
+ /*
+ * If DAX memory is hot-plugged into an unoccupied subsection
+ * of an early section, the unoptimized boot memmap is reused.
+ * See section_activate().
+ */
+ if (early_section(__pfn_to_section(pfn)) ||
+ !vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
@@ -1122,13 +1128,18 @@ void __ref memmap_init_zone_device(struct zone *zone,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+
if (pfns_per_compound == 1)
continue;
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- compound_nr_pages(altmap, pgmap));
+ compound_nr_pages(pfn, altmap, pgmap));
}
+ pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE);
+
pr_debug("%s initialised %lu pages in %ums\n", __func__,
nr_pages, jiffies_to_msecs(jiffies - start));
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 5754d1c364624..2311ae7c2ff45 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
break;
case MAP_DROPPABLE:
if (VM_DROPPABLE == VM_NONE)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
/*
* A locked or stack area makes no sense to be droppable.
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d49c254174da7..69a99af777771 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ typedef int __bitwise fpi_t;
/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+/* free_pages_prepare() has already been called for page(s) being freed. */
+#define FPI_PREPARED ((__force fpi_t)BIT(3))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -282,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+/*
+ * When page allocations stall for longer than a threshold,
+ * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning
+ * will be printed during this duration for the entire system.
+ */
+#define ALLOC_STALL_WARN_MSECS (10 * 1000UL)
+static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES;
+
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order,
int alloc_flags);
@@ -1211,14 +1222,18 @@ static inline bool should_skip_kasan_poison(struct page *page)
return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
-static void kernel_init_pages(struct page *page, int numpages)
+static void clear_highpages_kasan_tagged(struct page *page, int numpages)
{
- int i;
-
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++)
- clear_highpage_kasan_tagged(page + i);
+ if (!IS_ENABLED(CONFIG_HIGHMEM)) {
+ clear_pages(kasan_reset_tag(page_address(page)), numpages);
+ } else {
+ int i;
+
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
+ }
kasan_enable_current();
}
@@ -1303,8 +1318,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
#endif /* CONFIG_MEM_ALLOC_PROFILING */
-__always_inline bool __free_pages_prepare(struct page *page,
- unsigned int order, fpi_t fpi_flags)
+static __always_inline bool __free_pages_prepare(struct page *page,
+ unsigned int order, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page);
@@ -1312,6 +1327,9 @@ __always_inline bool __free_pages_prepare(struct page *page,
bool compound = PageCompound(page);
struct folio *folio = page_folio(page);
+ if (fpi_flags & FPI_PREPARED)
+ return true;
+
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
@@ -1423,7 +1441,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1451,7 +1469,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
- unsigned long flags;
unsigned int order;
struct page *page;
@@ -1464,7 +1481,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (count > 0) {
struct list_head *list;
@@ -1496,8 +1513,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
}
/* Split a multi-block free page into its individual pageblocks. */
@@ -1848,7 +1863,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
/* If memory is still not initialized, initialize it now. */
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -3424,7 +3439,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
struct zone *zone)
{
int mt;
- unsigned long max_managed, flags;
+ unsigned long max_managed;
/*
* The number reserved as: minimum is 1 pageblock, maximum is
@@ -3438,29 +3453,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
if (zone->nr_reserved_highatomic >= max_managed)
return;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
+ return;
/* Yoink! */
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
if (!migratetype_is_mergeable(mt))
- goto out_unlock;
+ return;
if (order < pageblock_order) {
if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
+ return;
zone->nr_reserved_highatomic += pageblock_nr_pages;
} else {
change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
zone->nr_reserved_highatomic += 1 << order;
}
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
}
/*
@@ -3476,7 +3488,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
@@ -3493,7 +3504,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
pageblock_nr_pages)
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
unsigned long size;
@@ -3540,12 +3551,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* so this should not fail on zone boundaries.
*/
WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (ret > 0)
return ret;
- }
}
- spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
@@ -4678,6 +4686,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
return false;
}
+static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned int order, unsigned long alloc_start_time)
+{
+ static DEFINE_SPINLOCK(alloc_stall_lock);
+ unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time);
+
+ if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS))
+ return;
+ if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies)))
+ return;
+ if (gfp_mask & __GFP_NOWARN)
+ return;
+
+ if (!spin_trylock(&alloc_stall_lock))
+ return;
+
+ /* Check again, this time under the lock */
+ if (time_is_after_jiffies(alloc_stall_warn_jiffies)) {
+ spin_unlock(&alloc_stall_lock);
+ return;
+ }
+
+ WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS));
+ spin_unlock(&alloc_stall_lock);
+
+ pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl",
+ current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
+ cpuset_print_current_mems_allowed();
+ pr_cont("\n");
+ dump_stack();
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -4698,6 +4740,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
bool compact_first = false;
bool can_retry_reserves = true;
+ unsigned long alloc_start_time = jiffies;
if (unlikely(nofail)) {
/*
@@ -4813,6 +4856,9 @@ retry:
if (current->flags & PF_MEMALLOC)
goto nopage;
+ /* If allocation has taken excessively long, warn about it */
+ check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time);
+
/* Try direct reclaim and then allocating */
if (!compact_first) {
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
@@ -5044,7 +5090,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct per_cpu_pages *pcp;
struct list_head *pcp_list;
struct alloc_context ac;
- gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
int nr_populated = 0, nr_account = 0;
@@ -5085,10 +5130,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
gfp &= gfp_allowed_mask;
- alloc_gfp = gfp;
- if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags))
goto out;
- gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
z = ac.preferred_zoneref;
@@ -5180,6 +5223,34 @@ failed:
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
+ * free_pages_bulk - Free an array of order-0 pages
+ * @page_array: Array of pages to free
+ * @nr_pages: The number of pages in the array
+ *
+ * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous
+ * run are released with a single __free_contig_range() call.
+ *
+ * This assumes page_array is sorted in ascending PFN order. Without that,
+ * the function still frees all pages, but contiguous runs may not be
+ * detected and the freeing pattern can degrade to freeing one page at a
+ * time.
+ *
+ * Context: Sleepable process context only; calls cond_resched()
+ */
+void free_pages_bulk(struct page **page_array, unsigned long nr_pages)
+{
+ while (nr_pages) {
+ unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages);
+
+ __free_contig_range(page_to_pfn(*page_array), nr_contig);
+
+ nr_pages -= nr_contig;
+ page_array += nr_contig;
+ cond_resched();
+ }
+}
+
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
@@ -6758,6 +6829,105 @@ void __init page_alloc_sysctl_init(void)
register_sysctl_init("vm", page_alloc_sysctl_table);
}
+static void free_prepared_contig_range(struct page *page,
+ unsigned long nr_pages)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ while (nr_pages) {
+ unsigned int order;
+
+ /* We are limited by the largest buddy order. */
+ order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER;
+ /* Don't exceed the number of pages to free. */
+ order = min_t(unsigned int, order, ilog2(nr_pages));
+ order = min_t(unsigned int, order, MAX_PAGE_ORDER);
+
+ /*
+ * Free the chunk as a single block. Our caller has already
+ * called free_pages_prepare() for each order-0 page.
+ */
+ __free_frozen_pages(page, order, FPI_PREPARED);
+
+ pfn += 1UL << order;
+ page += 1UL << order;
+ nr_pages -= 1UL << order;
+ }
+}
+
+static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages,
+ bool is_frozen)
+{
+ struct page *page, *start = NULL;
+ unsigned long nr_start = 0;
+ unsigned long start_sec;
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++) {
+ bool can_free = true;
+
+ /*
+ * Contiguous PFNs might not have contiguous "struct pages"
+ * in some kernel configs: page++ across a section boundary
+ * is undefined. Use pfn_to_page() for each PFN.
+ */
+ page = pfn_to_page(pfn + i);
+
+ VM_WARN_ON_ONCE(PageHead(page));
+ VM_WARN_ON_ONCE(PageTail(page));
+
+ if (!is_frozen)
+ can_free = put_page_testzero(page);
+
+ if (can_free)
+ can_free = free_pages_prepare(page, 0);
+
+ if (!can_free) {
+ if (start) {
+ free_prepared_contig_range(start, i - nr_start);
+ start = NULL;
+ }
+ continue;
+ }
+
+ if (start && memdesc_section(page->flags) != start_sec) {
+ free_prepared_contig_range(start, i - nr_start);
+ start = page;
+ nr_start = i;
+ start_sec = memdesc_section(page->flags);
+ } else if (!start) {
+ start = page;
+ nr_start = i;
+ start_sec = memdesc_section(page->flags);
+ }
+ }
+
+ if (start)
+ free_prepared_contig_range(start, nr_pages - nr_start);
+}
+
+/**
+ * __free_contig_range - Free contiguous range of order-0 pages.
+ * @pfn: Page frame number of the first page in the range.
+ * @nr_pages: Number of pages to free.
+ *
+ * For each order-0 struct page in the physically contiguous range, put a
+ * reference. Free any page who's reference count falls to zero. The
+ * implementation is functionally equivalent to, but significantly faster than
+ * calling __free_page() for each struct page in a loop.
+ *
+ * Memory allocated with alloc_pages(order>=1) then subsequently split to
+ * order-0 with split_page() is an example of appropriate contiguous pages that
+ * can be freed with this API.
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
+void __free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
@@ -6895,8 +7065,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
{
- for (; nr_pages--; pfn++)
- free_frozen_pages(pfn_to_page(pfn), 0);
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true);
}
/**
@@ -7304,8 +7473,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
return;
- for (; nr_pages--; pfn++)
- __free_page(pfn_to_page(pfn));
+ __free_contig_range(pfn, nr_pages);
}
EXPORT_SYMBOL(free_contig_range);
#endif /* CONFIG_CONTIG_ALLOC */
@@ -7363,7 +7531,7 @@ void zone_pcp_reset(struct zone *zone)
unsigned long __offline_isolated_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long already_offline = 0, flags;
+ unsigned long already_offline = 0;
unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
@@ -7371,7 +7539,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (pfn < end_pfn) {
page = pfn_to_page(pfn);
/*
@@ -7401,7 +7569,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
pfn += (1 << order);
}
- spin_unlock_irqrestore(&zone->lock, flags);
return end_pfn - start_pfn - already_offline;
}
@@ -7473,11 +7640,9 @@ bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
@@ -7492,14 +7657,12 @@ bool take_page_off_buddy(struct page *page)
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
- ret = true;
- break;
+ return true;
}
if (page_count(page_head) > 0)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
/*
@@ -7508,23 +7671,19 @@ bool take_page_off_buddy(struct page *page)
bool put_page_back_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
- unsigned long flags;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (put_page_testzero(page)) {
unsigned long pfn = page_to_pfn(page);
int migratetype = get_pfnblock_migratetype(page, pfn);
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
- if (TestClearPageHWPoison(page)) {
- ret = true;
- }
+ if (TestClearPageHWPoison(page))
+ return true;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index 70cea9e24d2fd..7ed76592e20d8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -326,8 +326,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
struct swap_iocb {
struct kiocb iocb;
- struct bio_vec bvec[SWAP_CLUSTER_MAX];
- int pages;
+ struct bio_vec bvecs[SWAP_CLUSTER_MAX];
+ int nr_bvecs;
int len;
};
static mempool_t *sio_pool;
@@ -348,7 +348,7 @@ int sio_pool_init(void)
static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec[0].bv_page;
+ struct page *page = sio->bvecs[0].bv_page;
int p;
if (ret != sio->len) {
@@ -362,15 +362,15 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
*/
pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
ret, swap_dev_pos(page_swap_entry(page)));
- for (p = 0; p < sio->pages; p++) {
- page = sio->bvec[p].bv_page;
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ page = sio->bvecs[p].bv_page;
set_page_dirty(page);
ClearPageReclaim(page);
}
}
- for (p = 0; p < sio->pages; p++)
- end_page_writeback(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++)
+ end_page_writeback(sio->bvecs[p].bv_page);
mempool_free(sio, sio_pool);
}
@@ -397,13 +397,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
init_sync_kiocb(&sio->iocb, swap_file);
sio->iocb.ki_complete = sio_write_complete;
sio->iocb.ki_pos = pos;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
@@ -477,7 +477,7 @@ void swap_write_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -489,8 +489,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
int p;
if (ret == sio->len) {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = page_folio(sio->bvecs[p].bv_page);
count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
@@ -499,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
}
count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
} else {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = page_folio(sio->bvecs[p].bv_page);
folio_unlock(folio);
}
@@ -559,13 +559,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
init_sync_kiocb(&sio->iocb, sis->swap_file);
sio->iocb.ki_pos = pos;
sio->iocb.ki_complete = sio_read_complete;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) {
swap_read_unplug(sio);
sio = NULL;
}
@@ -666,7 +666,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_read_complete(&sio->iocb, ret);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c48ff5c002449..7a9d631945a34 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
{
struct zone *zone = page_zone(page);
struct page *unmovable;
- unsigned long flags;
unsigned long check_unmovable_start, check_unmovable_end;
if (PageUnaccepted(page))
accept_page(page);
- spin_lock_irqsave(&zone->lock, flags);
-
- /*
- * We assume the caller intended to SET migrate type to isolate.
- * If it is already set, then someone else must have raced and
- * set it before us.
- */
- if (is_migrate_isolate_page(page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return -EBUSY;
- }
+ scoped_guard(spinlock_irqsave, &zone->lock) {
+ /*
+ * We assume the caller intended to SET migrate type to
+ * isolate. If it is already set, then someone else must have
+ * raced and set it before us.
+ */
+ if (is_migrate_isolate_page(page))
+ return -EBUSY;
- /*
- * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
- * We just check MOVABLE pages.
- *
- * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
- * to avoid redundant checks.
- */
- check_unmovable_start = max(page_to_pfn(page), start_pfn);
- check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
- end_pfn);
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by
+ * itself. We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's
+ * pageblock to avoid redundant checks.
+ */
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+ end_pfn);
- unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- mode);
- if (!unmovable) {
- if (!pageblock_isolate_and_move_free_pages(zone, page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return -EBUSY;
+ unmovable = has_unmovable_pages(check_unmovable_start,
+ check_unmovable_end, mode);
+ if (!unmovable) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page))
+ return -EBUSY;
+ zone->nr_isolate_pageblock++;
+ return 0;
}
- zone->nr_isolate_pageblock++;
- spin_unlock_irqrestore(&zone->lock, flags);
- return 0;
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
@@ -223,15 +215,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
bool isolated_page = false;
unsigned int order;
struct page *buddy;
zone = page_zone(page);
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (!is_migrate_isolate_page(page))
- goto out;
+ return;
/*
* Because freepage with more than pageblock_order on isolated
@@ -279,8 +270,6 @@ static void unset_migratetype_isolate(struct page *page)
__putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8178e0be557f8..2dddcb6510aa1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
- &page->flags);
+ &page->flags.f);
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
diff --git a/mm/readahead.c b/mm/readahead.c
index 7b05082c89ea2..8c12b63ccd4a2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -324,6 +324,8 @@ static void do_page_cache_ra(struct readahead_control *ractl,
return;
end_index = (isize - 1) >> PAGE_SHIFT;
+ if (end_index > ractl->_max_index)
+ end_index = ractl->_max_index;
if (index > end_index)
return;
/* Don't read past the page containing the last byte of the file */
@@ -471,7 +473,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
unsigned int min_order = mapping_min_folio_order(mapping);
- pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ pgoff_t limit;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
@@ -484,6 +486,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
goto fallback;
}
+ limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ limit = min(limit, ractl->_max_index);
limit = min(limit, index + ra->size - 1);
new_order = min(mapping_max_folio_order(mapping), new_order);
diff --git a/mm/rmap.c b/mm/rmap.c
index 99e1b3dc390b7..1c77d5dc06e9f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -571,7 +571,7 @@ void __init anon_vma_init(void)
* In case it was remapped to a different anon_vma, the new anon_vma will be a
* child of the old anon_vma, and the anon_vma lifetime rules will therefore
* ensure that any anon_vma obtained from the page will still be valid for as
- * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ * long as we observe folio_mapped() [ hence all those folio_mapped() tests ].
*
* All users of this function must be very careful when walking the anon_vma
* chain and verify that the page in question is indeed mapped in it
@@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_unmap() may return before page_mapped() has become false,
+ * try_to_unmap() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_migrate() may return before page_mapped() has become false,
+ * try_to_migrate() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
/*
* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
- * because that depends on page_mapped(); but not all its usages
+ * because that depends on folio_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 3b5dc21b323c2..bab3529af23c5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3101,10 +3101,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
cache_no_acl(inode);
if (sbinfo->noswap)
mapping_set_unevictable(inode->i_mapping);
-
- /* Don't consider 'deny' for emergencies and 'force' for testing */
- if (sbinfo->huge)
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 6eadb9d116e43..112ccf9c71caf 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
- void *ptr;
-
if (altmap)
return altmap_alloc_block_buf(size, altmap);
- ptr = sparse_buffer_alloc(size);
- if (!ptr)
- ptr = vmemmap_alloc_block(size, node);
- return ptr;
+ return vmemmap_alloc_block(size, node);
}
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
@@ -151,7 +146,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
unsigned long ptpfn, unsigned long flags)
{
@@ -195,7 +190,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
return p;
}
-pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
@@ -208,7 +203,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
+static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
@@ -221,7 +216,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
return pud;
}
-p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
@@ -234,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
return p4d;
}
-pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
if (pgd_none(*pgd)) {
@@ -652,26 +647,61 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
+static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ const unsigned int order = pgmap ? pgmap->vmemmap_shift : 0;
+ const unsigned long pages_per_compound = 1UL << order;
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
+ VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
+
+ if (!vmemmap_can_optimize(altmap, pgmap))
+ return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
+
+ if (order < PFN_SECTION_SHIFT) {
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound));
+ return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound;
+ }
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION));
+
+ if (IS_ALIGNED(pfn, pages_per_compound))
+ return VMEMMAP_RESERVE_NR;
+
+ return 0;
+}
+
static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
+ struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap,
+ pgmap);
+
+ memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
+
+ return page;
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
+ memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
vmemmap_free(start, end, altmap);
}
+
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long pfn = page_to_pfn(memmap);
+ memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
+ NULL, NULL));
vmemmap_free(start, end, NULL);
}
@@ -737,7 +767,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
* usage map, but still need to free the vmemmap range.
*/
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
@@ -774,14 +804,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early) {
- memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
- depopulate_section_memmap(pfn, nr_pages, altmap);
- } else if (memmap) {
- memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
- PAGE_SIZE)));
+ if (!section_is_early)
+ depopulate_section_memmap(pfn, nr_pages, altmap, pgmap);
+ else if (memmap)
free_map_bootmem(memmap);
- }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -823,10 +849,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
if (!memmap) {
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
return ERR_PTR(-ENOMEM);
}
- memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
@@ -885,13 +910,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
}
void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab13..16ac6df3c89fa 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -201,13 +201,11 @@ static void __init memblocks_present(void)
int i, nid;
#ifdef CONFIG_SPARSEMEM_EXTREME
- if (unlikely(!mem_section)) {
- unsigned long size, align;
+ unsigned long size, align;
- size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
- align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc_or_panic(size, align);
- }
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_alloc_or_panic(size, align);
#endif
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
@@ -241,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
- struct page *map = sparse_buffer_alloc(size);
+ struct page *map;
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- if (map)
- return map;
-
map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
@@ -256,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-static void *sparsemap_buf __meminitdata;
-static void *sparsemap_buf_end __meminitdata;
-
-static inline void __meminit sparse_buffer_free(unsigned long size)
-{
- WARN_ON(!sparsemap_buf || size == 0);
- memblock_free(sparsemap_buf, size);
-}
-
-static void __init sparse_buffer_init(unsigned long size, int nid)
-{
- phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
- /*
- * Pre-allocated buffer is mainly used by __populate_section_memmap
- * and we want it to be properly aligned to the section size - this is
- * especially the case for VMEMMAP which maps memmap to PMDs
- */
- sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
- sparsemap_buf_end = sparsemap_buf + size;
-}
-
-static void __init sparse_buffer_fini(void)
-{
- unsigned long size = sparsemap_buf_end - sparsemap_buf;
-
- if (sparsemap_buf && size > 0)
- sparse_buffer_free(size);
- sparsemap_buf = NULL;
-}
-
-void * __meminit sparse_buffer_alloc(unsigned long size)
-{
- void *ptr = NULL;
-
- if (sparsemap_buf) {
- ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
- if (ptr + size > sparsemap_buf_end)
- ptr = NULL;
- else {
- /* Free redundant aligned space */
- if ((unsigned long)(ptr - sparsemap_buf) > 0)
- sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
- sparsemap_buf = ptr + size;
- }
- }
- return ptr;
-}
-
void __weak __meminit vmemmap_populate_print_last(void)
{
}
@@ -362,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
goto failed;
}
- sparse_buffer_init(map_count * section_map_size(), nid);
-
sparse_vmemmap_init_nid_early(nid);
for_each_present_section_nr(pnum_begin, pnum) {
@@ -381,7 +325,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
__func__, nid);
pnum_begin = pnum;
sparse_usage_fini();
- sparse_buffer_fini();
goto failed;
}
memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
@@ -390,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
}
}
sparse_usage_fini();
- sparse_buffer_fini();
return;
failed:
/*
diff --git a/mm/swap.c b/mm/swap.c
index 5cc44f0de9877..2dd84813f4dde 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -160,14 +160,42 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
+ struct folio_batch free_fbatch;
+ bool is_lru_add = (move_fn == lru_add);
+
+ /*
+ * If we're adding to the LRU, preemptively filter dead folios. Use
+ * this dedicated folio batch for temp storage and deferred cleanup.
+ */
+ if (is_lru_add)
+ folio_batch_init(&free_fbatch);
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
/* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ if (!is_lru_add && !folio_test_clear_lru(folio))
continue;
+ /*
+ * Filter dead folios by moving them from the add batch to the temp
+ * batch for freeing after this loop.
+ *
+ * We're bypassing normal cleanup. Clear flags that are not
+ * applicable to dead folios.
+ *
+ * Since the folio may be part of a huge page, unqueue from
+ * deferred split list to avoid a dangling list entry.
+ */
+ if (is_lru_add && folio_ref_freeze(folio, 1)) {
+ __folio_clear_active(folio);
+ __folio_clear_unevictable(folio);
+ folio_unqueue_deferred_split(folio);
+ fbatch->folios[i] = NULL;
+ folio_batch_add(&free_fbatch, folio);
+ continue;
+ }
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec)
lruvec_unlock_irqrestore(lruvec, flags);
+
+ /* Cleanup filtered dead folios. */
+ if (is_lru_add) {
+ mem_cgroup_uncharge_folios(&free_fbatch);
+ free_unref_folios(&free_fbatch);
+ }
+
folios_put(fbatch);
}
@@ -964,6 +999,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
struct folio *folio = folios->folios[i];
unsigned int nr_refs = refs ? refs[i] : 1;
+ /* Folio batch entry may have been preemptively removed during drain. */
+ if (!folio)
+ continue;
+
if (is_huge_zero_folio(folio))
continue;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9174f1eeffb09..74a1e324449dc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1054,6 +1054,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
+ cond_resched();
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bb6ae08d18f58..eabb86b13b7e5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3459,19 +3459,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
- for (i = 0; i < vm->nr_pages; i++) {
- struct page *page = vm->pages[i];
- BUG_ON(!page);
- /*
- * High-order allocs for huge vmallocs are split, so
- * can be freed as an array of order-0 allocations
- */
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_lruvec_page_state(page, NR_VMALLOC, -1);
- __free_page(page);
- cond_resched();
+ if (!(vm->flags & VM_MAP_PUT_PAGES)) {
+ for (i = 0; i < vm->nr_pages; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
}
+ free_pages_bulk(vm->pages, vm->nr_pages);
+
kvfree(vm->pages);
kfree(vm);
}
@@ -3939,7 +3933,7 @@ fail:
__GFP_NOFAIL | __GFP_ZERO |\
__GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
- GFP_USER | __GFP_NOLOCKDEP)
+ GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN)
static gfp_t vmalloc_fix_flags(gfp_t flags)
{
@@ -3980,6 +3974,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
*
* %__GFP_NOWARN can be used to suppress failure messages.
*
+ * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages
+ * (when prot=%PAGE_KERNEL).
+ *
* Can not be called from interrupt nor NMI contexts.
* Return: the address of the area or %NULL on failure
*/
@@ -3993,6 +3990,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
+ bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN);
if (WARN_ON_ONCE(!size))
return NULL;
@@ -4023,7 +4021,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
again:
area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
- gfp_mask, caller);
+ gfp_mask & ~__GFP_SKIP_KASAN, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
@@ -4041,7 +4039,7 @@ again:
* kasan_unpoison_vmalloc().
*/
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
- if (kasan_hw_tags_enabled()) {
+ if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) {
/*
* Modify protection bits to allow tagging.
* This must be done before mapping.
@@ -4078,7 +4076,8 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
+ if (!skip_vmalloc_kasan)
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 3fbb86996c4d2..f053554e58264 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work)
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
+ * @order: allocation order being reclaimed for
* @memcg: cgroup memory controller handle
* @tree: legacy subtree mode
* @scanned: number of pages scanned
@@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work)
*
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr;
@@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
level = vmpressure_calc_level(scanned, reclaimed);
- if (level > VMPRESSURE_LOW) {
+ /*
+ * Once we go above COSTLY_ORDER, reclaim relies heavily on
+ * compaction to make progress. Reclaim efficiency was never a
+ * great proxy for pressure to begin with, but it's outright
+ * misleading with these high orders. Don't throttle sockets
+ * because somebody is attempting something crazy like an order-7
+ * and predictably struggling.
+ */
+ if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) {
/*
* Let the socket buffer allocator know that
* we are having trouble reclaiming LRU pages.
@@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, true, vmpressure_win, 0);
+ vmpressure(gfp, 0, memcg, true, vmpressure_win, 0);
}
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa125819..4b09843876583 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -109,7 +109,7 @@ struct scan_control {
/* zone_reclaim_mode */
unsigned int may_unmap:1;
- /* zome_reclaim_mode, boost reclaim, cgroup restrictions */
+ /* zone_reclaim_mode, boost reclaim, cgroup restrictions */
unsigned int may_swap:1;
/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -5071,8 +5071,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
+ sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
flush_reclaim_state(sc);
@@ -6175,7 +6175,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
/* Record the group's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false,
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
@@ -6220,7 +6220,7 @@ again:
/* Record the subtree's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned, nr_node_reclaimed);
if (nr_node_reclaimed)
@@ -6359,7 +6359,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
if (current_is_kswapd() || cgroup_reclaim(sc))
return;
- /* Throttle if making no progress at high prioities. */
+ /* Throttle if making no progress at high priorities. */
if (sc->priority == 1 && !sc->nr_reclaimed)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
@@ -7121,6 +7121,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.may_unmap = 1,
};
+ trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order,
+ highest_zoneidx);
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
@@ -7222,7 +7224,7 @@ restart:
/*
* There should be no need to raise the scanning priority if
- * enough pages are already being scanned that that high
+ * enough pages are already being scanned that the high
* watermark would be met at 100% efficiency.
*/
if (kswapd_shrink_node(pgdat, &sc))
@@ -7314,6 +7316,9 @@ out:
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
+ trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order,
+ highest_zoneidx, sc.nr_reclaimed);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller