diff options
| author | Andrew Morton <akpm@linux-foundation.org> | 2026-05-28 21:30:18 -0700 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2026-05-28 21:30:18 -0700 |
| commit | 82432d0b9ab22b91c2baf0458f3a0669af7a8156 (patch) | |
| tree | e377f474bae4c2166caeb560b2853539f78829c2 /mm | |
| parent | d90fdc074685684dfc210e86688cb009c1a327a7 (diff) | |
| parent | cfaef29c20e86738aec28641b6de1e078298999e (diff) | |
| download | linux-next-history-82432d0b9ab22b91c2baf0458f3a0669af7a8156.tar.gz | |
foo
Diffstat (limited to 'mm')
41 files changed, 1342 insertions, 597 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e8bf1e9e6ad90..e221fa1dc54d0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -810,7 +810,6 @@ if TRANSPARENT_HUGEPAGE choice prompt "Transparent Hugepage Support sysfs defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_ALWAYS help Selects the sysfs defaults for Transparent Hugepage Support. @@ -840,7 +839,6 @@ endchoice choice prompt "Shmem hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -886,7 +884,6 @@ endchoice choice prompt "Tmpfs hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -931,7 +928,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT + depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. 
XXX: For now, swap cluster backing transparent huge page diff --git a/mm/damon/core.c b/mm/damon/core.c index 3dbbbfdeff719..9f38deddcb30e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,10 +13,14 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/psi.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_choices.h> +/* for damon_get_folio() used by node eligible memory metrics */ +#include "ops-common.h" + #define CREATE_TRACE_POINTS #include <trace/events/damon.h> @@ -918,6 +922,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) if (err) return err; dst->goal_tuner = src->goal_tuner; + dst->fail_charge_num = src->fail_charge_num; + dst->fail_charge_denom = src->fail_charge_denom; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; @@ -1326,11 +1332,26 @@ static int damon_commit_targets( int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + struct damos *scheme; + struct damos_quota_goal *goal; dst->maybe_corrupted = true; if (!is_power_of_2(src->min_region_sz)) return -EINVAL; + /* node_eligible_mem_bp metric requires PADDR ops */ + if (src->ops.id != DAMON_OPS_PADDR) { + damon_for_each_scheme(scheme, src) { + struct damos_quota *quota = &scheme->quota; + + damos_for_each_quota_goal(goal, quota) { + if (goal->metric == + DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP) + return -EINVAL; + } + } + } + err = damon_commit_schemes(dst, src); if (err) return err; @@ -1349,6 +1370,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) if (err) return err; } + dst->pause = src->pause; dst->ops = src->ops; dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; @@ -2046,6 +2068,37 @@ static void damos_walk_cancel(struct damon_ctx *ctx) mutex_unlock(&ctx->walk_control_lock); } +static void damos_charge_quota(struct damos_quota *quota, + unsigned long 
sz_region, unsigned long sz_applied) +{ + /* + * sz_applied could be bigger than sz_region, depending on ops + * implementation of the action, e.g., damos_pa_pageout(). Charge only + * the region size in the case. + */ + if (!quota->fail_charge_denom || sz_applied > sz_region) + quota->charged_sz += sz_region; + else + quota->charged_sz += sz_applied + mult_frac( + (sz_region - sz_applied), + quota->fail_charge_num, + quota->fail_charge_denom); +} + +static bool damos_quota_is_full(struct damos_quota *quota, + unsigned long min_region_sz) +{ + if (!damos_quota_is_set(quota)) + return false; + if (quota->charged_sz >= quota->esz) + return true; + /* + * DAMOS action is applied per region, so <min_region_sz remaining + * quota means the quota is effectively full. + */ + return quota->esz - quota->charged_sz < min_region_sz; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -2102,11 +2155,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) { + damos_charge_quota(quota, sz, sz_applied); + if (damos_quota_is_full(quota, c->min_region_sz)) { quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; + quota->charge_addr_from = r->ar.end; } } if (s->action != DAMOS_STAT) @@ -2132,8 +2184,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Check the quota */ - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) continue; if (damos_skip_charged_region(t, r, s, c->min_region_sz)) @@ -2152,6 +2203,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } /* + * damos_apply_target() - Apply DAMOS schemes to a given target. 
+ * @c: monitoring context to apply its DAMOS schemes to. + * @t: monitoring target to apply the schemes to. + * @max_region_sz: maximum region size for @c. + * + * This function could split regions for keeping the quota. To minimize + * overhead from the split operations increased number of regions, this + * function will also merge regions after the schemes applying attempt is done, + * for each region. The merge operation is made only when it doesn't lose the + * monitoring information and not violating @max_region_sz. + * + * Hence, after this function is called, the total number of regions could + * be increased or reduced. The increase could make max_nr_regions temporarily + * be violated, until the next per-aggregation interval regions merge operation + * is executed. The decrease will not violate min_nr_regions though, since it + * keeps @max_region_sz. + */ +static void damos_apply_target(struct damon_ctx *c, struct damon_target *t, + unsigned long max_region_sz) +{ + struct damon_region *r; + + damon_for_each_region(r, t) { + struct damon_region *prev_r; + + damon_do_apply_schemes(c, t, r); + /* + * damon_do_apply_schemes() could split the region for the + * quota. Keeping the new slices is an overhead. Merge back + * the slices into the previous region if it doesn't lose any + * information and not violating the max_region_sz. + */ + if (damon_first_region(t) == r) + continue; + prev_r = damon_prev_region(r); + if (prev_r->ar.end != r->ar.start) + continue; + if (prev_r->age != r->age) + continue; + if (prev_r->last_nr_accesses != r->last_nr_accesses) + continue; + if (prev_r->nr_accesses != r->nr_accesses) + continue; + if (r->ar.end - prev_r->ar.start > max_region_sz) + continue; + prev_r->ar.end = r->ar.end; + damon_destroy_region(r, t); + r = prev_r; + } +} + +/* * damon_feed_loop_next_input() - get next input to achieve a target score. * @last_input The last input. * @score Current score that made with @last_input. 
@@ -2287,7 +2390,115 @@ static unsigned long damos_get_node_memcg_used_bp( numerator = i.totalram - used_pages; return mult_frac(numerator, 10000, i.totalram); } -#else + +#ifdef CONFIG_DAMON_PADDR +/* + * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node. + * @c: The DAMON context. + * @s: The scheme. + * @nid: The target NUMA node id. + * @total: Output for total eligible bytes across all nodes. + * + * Iterates through each folio in eligible regions to accurately determine + * which node the memory resides on. Returns eligible bytes on the specified + * node and sets *total to the sum across all nodes. + * + * Note: This function requires damon_get_folio() from ops-common.c, which is + * only available when CONFIG_DAMON_PADDR is enabled. It also requires the + * context to be using PADDR operations for meaningful results. + */ +static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c, + struct damos *s, int nid, phys_addr_t *total) +{ + struct damon_target *t; + struct damon_region *r; + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible = 0; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + phys_addr_t addr, end_addr; + + if (!__damos_valid_target(r, s)) + continue; + + /* Convert from core address units to physical bytes */ + addr = (phys_addr_t)r->ar.start * c->addr_unit; + end_addr = (phys_addr_t)r->ar.end * c->addr_unit; + while (addr < end_addr) { + struct folio *folio; + phys_addr_t folio_start, folio_end; + phys_addr_t overlap_start, overlap_end; + phys_addr_t counted; + + folio = damon_get_folio(PHYS_PFN(addr)); + if (!folio) { + addr = PAGE_ALIGN_DOWN(addr + + PAGE_SIZE); + if (!addr) + break; + continue; + } + + /* + * Calculate exact overlap between the region + * [addr, end_addr) and the folio range. + * The folio may start before addr if addr is + * in the middle of a large folio. 
+ */ + folio_start = PFN_PHYS(folio_pfn(folio)); + folio_end = folio_start + folio_size(folio); + + overlap_start = max(addr, folio_start); + overlap_end = min(end_addr, folio_end); + + if (overlap_end > overlap_start) { + counted = overlap_end - overlap_start; + total_eligible += counted; + if (folio_nid(folio) == nid) + node_eligible += counted; + } + + /* Advance past the entire folio */ + addr = folio_end; + folio_put(folio); + } + cond_resched(); + } + } + + *total = total_eligible; + return node_eligible; +} + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible; + + if (c->ops.id != DAMON_OPS_PADDR) + return 0; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)) + return 0; + + node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible); + + if (!(unsigned long)total_eligible) + return 0; + + return mult_frac((unsigned long)node_eligible, 10000, + (unsigned long)total_eligible); +} +#else /* CONFIG_DAMON_PADDR */ +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_DAMON_PADDR */ +#else /* CONFIG_NUMA */ static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { @@ -2299,7 +2510,13 @@ static unsigned long damos_get_node_memcg_used_bp( { return 0; } -#endif + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_NUMA */ /* * Returns LRU-active or inactive memory to total LRU memory size ratio. 
@@ -2319,7 +2536,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio) return mult_frac(inactive, 10000, total); } -static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +static void damos_set_quota_goal_current_value(struct damon_ctx *c, + struct damos *s, struct damos_quota_goal *goal) { u64 now_psi_total; @@ -2345,19 +2563,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_in_active_mem_bp( goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->current_value = damos_get_node_eligible_mem_bp(c, s, + goal->nid); + break; default: break; } } /* Return the highest score since it makes schemes least aggressive */ -static unsigned long damos_quota_score(struct damos_quota *quota) +static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s) { struct damos_quota_goal *goal; + struct damos_quota *quota = &s->quota; unsigned long highest_score = 0; damos_for_each_quota_goal(goal, quota) { - damos_set_quota_goal_current_value(goal); + damos_set_quota_goal_current_value(c, s, goal); highest_score = max(highest_score, mult_frac(goal->current_value, 10000, goal->target_value)); @@ -2366,17 +2589,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota) return highest_score; } -static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + unsigned long score = damos_quota_score(c, s); quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), score); } -static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c, + struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + 
unsigned long score = damos_quota_score(c, s); if (score >= 10000) quota->esz_bp = 0; @@ -2389,9 +2615,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ -static void damos_set_effective_quota(struct damos_quota *quota, - struct damon_ctx *ctx) +static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s) { + struct damos_quota *quota = &s->quota; unsigned long throughput; unsigned long esz = ULONG_MAX; @@ -2402,9 +2628,9 @@ static void damos_set_effective_quota(struct damos_quota *quota, if (!list_empty(&quota->goals)) { if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) - damos_goal_tune_esz_bp_consist(quota); + damos_goal_tune_esz_bp_consist(ctx, s); else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) - damos_goal_tune_esz_bp_temporal(quota); + damos_goal_tune_esz_bp_temporal(ctx, s); esz = quota->esz_bp / 10000; } @@ -2452,22 +2678,21 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* First charge window */ if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); } /* New charge window starts */ if (!time_in_range_open(jiffies, quota->charged_from, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; if (trace_damos_esz_enabled()) cached_esz = quota->esz; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); if (trace_damos_esz_enabled() && quota->esz != cached_esz) damos_trace_esz(c, s, quota); } @@ -2521,9 +2746,9 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s) static void kdamond_apply_schemes(struct 
damon_ctx *c) { struct damon_target *t; - struct damon_region *r; struct damos *s; bool has_schemes_to_apply = false; + unsigned long max_region_sz; damon_for_each_scheme(s, c) { if (time_before(c->passed_sample_intervals, s->next_apply_sis)) @@ -2540,13 +2765,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + max_region_sz = damon_region_sz_limit(c); mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { if (c->ops.target_valid && c->ops.target_valid(t) == false) continue; - - damon_for_each_region(r, t) - damon_do_apply_schemes(c, t, r); + damos_apply_target(c, t, max_region_sz); } damon_for_each_scheme(s, c) { @@ -3014,6 +3238,14 @@ static int kdamond_fn(void *data) kdamond_call(ctx, false); if (ctx->maybe_corrupted) break; + while (ctx->pause) { + damos_walk_cancel(ctx); + kdamond_usleep(ctx->attrs.sample_interval); + /* allow caller unset pause via damon_call() */ + kdamond_call(ctx, false); + if (kdamond_need_stop(ctx) || ctx->maybe_corrupted) + goto done; + } if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else @@ -3096,14 +3328,20 @@ done: return 0; } -static int walk_system_ram(struct resource *res, void *arg) +struct damon_system_ram_range_walk_arg { + bool walked; + struct resource res; +}; + +static int damon_system_ram_walk_fn(struct resource *res, void *arg) { - struct resource *a = arg; + struct damon_system_ram_range_walk_arg *a = arg; - if (resource_size(a) < resource_size(res)) { - a->start = res->start; - a->end = res->end; + if (!a->walked) { + a->walked = true; + a->res.start = res->start; } + a->res.end = res->end; return 0; } @@ -3120,27 +3358,24 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra, return ra / addr_unit; } -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool damon_find_biggest_system_ram(unsigned long *start, +static bool damon_find_system_rams_range(unsigned long *start, unsigned long *end, unsigned long addr_unit) - { - struct resource res = {}; + struct damon_system_ram_range_walk_arg arg = {}; - walk_system_ram_res(0, -1, &res, walk_system_ram); - *start = damon_res_to_core_addr(res.start, addr_unit); - *end = damon_res_to_core_addr(res.end + 1, addr_unit); + walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn); + if (!arg.walked) + return false; + *start = damon_res_to_core_addr(arg.res.start, addr_unit); + *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit); if (*end <= *start) return false; return true; } /** - * damon_set_region_biggest_system_ram_default() - Set the region of the given - * monitoring target as requested, or biggest 'System RAM'. + * damon_set_region_system_rams_default() - Set the region of the given + * monitoring target as requested, or to cover all 'System RAM' resources. * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. @@ -3148,14 +3383,14 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the - * values of @start and @end are zero, however, this function finds the biggest - * 'System RAM' resource and sets the region to cover the resource. In the - * latter case, this function saves the start and end addresses of the resource - * in @start and @end, respectively. + * values of @start and @end are zero, however, this function finds 'System + * RAM' resources and sets the region to cover all the resources. In the latter + * case, this function saves the start and the end addresses of the first and + * the last resources in @start and @end, respectively. * * Return: 0 on success, negative error code otherwise. 
*/ -int damon_set_region_biggest_system_ram_default(struct damon_target *t, +int damon_set_region_system_rams_default(struct damon_target *t, unsigned long *start, unsigned long *end, unsigned long addr_unit, unsigned long min_region_sz) { @@ -3165,7 +3400,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end, addr_unit)) + !damon_find_system_rams_range(start, end, addr_unit)) return -EINVAL; addr_range.start = *start; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8cfe7bd3dc1d3..08500a15bbfb8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * the re-reading, DAMON_LRU_SORT will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Desired active to [in]active memory ratio in bp (1/10,000). @@ -140,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -149,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. 
*/ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -327,7 +328,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) goto out; - err = damon_set_region_biggest_system_ram_default(param_target, + err = damon_set_region_system_rams_default(param_target, &monitor_region_start, &monitor_region_end, param_ctx->addr_unit, @@ -340,18 +341,51 @@ out: return err; } -static int damon_lru_sort_handle_commit_inputs(void) +static int damon_lru_sort_commit_inputs_fn(void *arg) { + return damon_lru_sort_apply_parameters(); +} + +static int damon_lru_sort_commit_inputs_store(const char *val, + const struct kernel_param *kp) +{ + bool commit_inputs_request; int err; + struct damon_call_control control = { + .fn = damon_lru_sort_commit_inputs_fn, + }; + + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } - if (!commit_inputs) + if (!commit_inputs_request) return 0; - err = damon_lru_sort_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? 
err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_lru_sort_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_lru_sort_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -365,7 +399,7 @@ static int damon_lru_sort_damon_call_fn(void *arg) damon_lru_sort_cold_stat = s->stat; } - return damon_lru_sort_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index c3e4c871b0bb2..5c93ef2bb8a97 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, damon_max_nr_accesses(&c->attrs); age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; + if (age_in_sec) + age_in_log = min_t(int, ilog2(age_in_sec) + 1, + DAMON_MAX_AGE_IN_LOG); + else + age_in_log = 0; + /* If frequency is 0, higher age means it's colder */ if (freq_subscore == 0) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 96f6dfc28eae4..1f0319bfa0788 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * re-reading, DAMON_RECLAIM will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Time threshold for cold memory regions identification in microseconds. @@ -92,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600); static unsigned long quota_autotune_feedback __read_mostly; module_param(quota_autotune_feedback, ulong, 0600); +/* + * Auto-tune monitoring intervals. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's + * sampling and aggregation intervals. 
The auto-tuning aims to capture + * meaningful amount of access events in each DAMON-snapshot, while keeping the + * sampling intervals 5 milliseconds in minimum, and 10 seconds in maximum. + * Setting this as ``N`` disables the auto-tuning. + * + * Disabled by default. + */ +static bool autotune_monitoring_intervals __read_mostly; +module_param(autotune_monitoring_intervals, bool, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -114,7 +127,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -123,7 +137,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. 
*/ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -151,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, static struct damon_ctx *ctx; static struct damon_target *target; -static struct damos *damon_reclaim_new_scheme(void) +static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval) { struct damos_access_pattern pattern = { /* Find regions having PAGE_SIZE or larger size */ @@ -161,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / - damon_reclaim_mon_attrs.aggr_interval, + .min_age_region = min_age / aggr_interval, .max_age_region = UINT_MAX, }; @@ -183,6 +197,7 @@ static int damon_reclaim_apply_parameters(void) { struct damon_ctx *param_ctx; struct damon_target *param_target; + struct damon_attrs attrs; struct damos *scheme; struct damos_quota_goal *goal; struct damos_filter *filter; @@ -200,12 +215,21 @@ static int damon_reclaim_apply_parameters(void) goto out; } - err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); + attrs = damon_reclaim_mon_attrs; + if (autotune_monitoring_intervals) { + attrs.sample_interval = 5000; + attrs.aggr_interval = 100000; + attrs.intervals_goal.access_bp = 40; + attrs.intervals_goal.aggrs = 3; + attrs.intervals_goal.min_sample_us = 5000; + attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000; + } + err = damon_set_attrs(param_ctx, &attrs); if (err) goto out; err = -ENOMEM; - scheme = damon_reclaim_new_scheme(); + scheme = damon_reclaim_new_scheme(attrs.aggr_interval); if (!scheme) goto out; damon_set_schemes(param_ctx, &scheme, 1); @@ -233,11 +257,9 @@ static int damon_reclaim_apply_parameters(void) damos_add_filter(scheme, filter); } - err = damon_set_region_biggest_system_ram_default(param_target, - &monitor_region_start, - &monitor_region_end, - param_ctx->addr_unit, - param_ctx->min_region_sz); + err 
= damon_set_region_system_rams_default(param_target, + &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); @@ -246,18 +268,51 @@ out: return err; } -static int damon_reclaim_handle_commit_inputs(void) +static int damon_reclaim_commit_inputs_fn(void *arg) { + return damon_reclaim_apply_parameters(); +} + +static int damon_reclaim_commit_inputs_store(const char *val, + const struct kernel_param *kp) +{ + bool commit_inputs_request; int err; + struct damon_call_control control = { + .fn = damon_reclaim_commit_inputs_fn, + }; + + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } - if (!commit_inputs) + if (!commit_inputs_request) return 0; - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? 
err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_reclaim_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_reclaim_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -267,7 +322,7 @@ static int damon_reclaim_damon_call_fn(void *arg) damon_for_each_scheme(s, c) damon_reclaim_stat = s->stat; - return damon_reclaim_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 3951b762cbddf..0e14f5bb8f75d 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data) return 0; } -struct damon_stat_system_ram_range_walk_arg { - bool walked; - struct resource res; -}; - -static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg) -{ - struct damon_stat_system_ram_range_walk_arg *a = arg; - - if (!a->walked) { - a->walked = true; - a->res.start = res->start; - } - a->res.end = res->end; - return 0; -} - -static unsigned long damon_stat_res_to_core_addr(resource_size_t ra, - unsigned long addr_unit) -{ - /* - * Use div_u64() for avoiding linking errors related with __udivdi3, - * __aeabi_uldivmod, or similar problems. This should also improve the - * performance optimization (read div_u64() comment for the detail). 
- */ - if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) - return div_u64(ra, addr_unit); - return ra / addr_unit; -} - -static int damon_stat_set_monitoring_region(struct damon_target *t, - unsigned long addr_unit, unsigned long min_region_sz) -{ - struct damon_addr_range addr_range; - struct damon_stat_system_ram_range_walk_arg arg = {}; - - walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn); - if (!arg.walked) - return -EINVAL; - addr_range.start = damon_stat_res_to_core_addr( - arg.res.start, addr_unit); - addr_range.end = damon_stat_res_to_core_addr( - arg.res.end + 1, addr_unit); - if (addr_range.end <= addr_range.start) - return -EINVAL; - return damon_set_regions(t, &addr_range, 1, min_region_sz); -} - static struct damon_ctx *damon_stat_build_ctx(void) { struct damon_ctx *ctx; struct damon_attrs attrs; struct damon_target *target; + unsigned long start = 0, end = 0; ctx = damon_new_ctx(); if (!ctx) @@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_stat_set_monitoring_region(target, ctx->addr_unit, - ctx->min_region_sz)) + if (damon_set_region_system_rams_default(target, &start, &end, + ctx->addr_unit, ctx->min_region_sz)) goto free_out; return ctx; free_out: @@ -313,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp) return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N'); } +static int damon_stat_kdamond_pid_store( + const char *val, const struct kernel_param *kp) +{ + /* + * kdamond_pid is read-only, but kernel command line could write it. + * Do nothing here. 
+ */ + return 0; +} + +static int damon_stat_kdamond_pid_load( + char *buffer, const struct kernel_param *kp) +{ + int pid; + + if (!damon_stat_context) { + pid = -1; + } else { + pid = damon_kdamond_pid(damon_stat_context); + if (pid < 1) + pid = -1; + } + return sprintf(buffer, "%d\n", pid); +} + +static const struct kernel_param_ops kdamond_pid_param_ops = { + .set = damon_stat_kdamond_pid_store, + .get = damon_stat_kdamond_pid_load, +}; + +/* + * PID of the DAMON thread + * + * If DAMON_STAT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400); +MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond"); + static int __init damon_stat_init(void) { int err = 0; diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a8014780edae9..ab2153fff9a83 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1093,6 +1093,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_INACTIVE_MEM_BP, .name = "inactive_mem_bp", }, + { + .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP, + .name = "node_eligible_mem_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -1508,6 +1512,8 @@ struct damon_sysfs_quotas { unsigned long reset_interval_ms; unsigned long effective_sz; /* Effective size quota in bytes */ enum damos_quota_goal_tuner goal_tuner; + unsigned int fail_charge_num; + unsigned int fail_charge_denom; }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1682,6 +1688,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj, return -EINVAL; } +static ssize_t fail_charge_num_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_num); +} + +static ssize_t fail_charge_num_store(struct kobject *kobj, + struct kobj_attribute 
*attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_num); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t fail_charge_denom_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom); +} + +static ssize_t fail_charge_denom_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_denom); + + if (err) + return -EINVAL; + return count; +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1702,12 +1750,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr = __ATTR_RW_MODE(goal_tuner, 0600); +static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr = + __ATTR_RW_MODE(fail_charge_num, 0600); + +static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr = + __ATTR_RW_MODE(fail_charge_denom, 0600); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, &damon_sysfs_quotas_effective_bytes_attr.attr, &damon_sysfs_quotas_goal_tuner_attr.attr, + &damon_sysfs_quotas_fail_charge_num_attr.attr, + &damon_sysfs_quotas_fail_charge_denom_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -2061,6 +2117,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = { .name = "nohugepage", }, { + .action = DAMOS_COLLAPSE, + .name = "collapse", + }, + { .action = DAMOS_LRU_PRIO, 
.name = "lru_prio", }, @@ -2685,6 +2745,9 @@ static int damos_sysfs_add_quota_score( } goal->nid = sysfs_goal->nid; break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->nid = sysfs_goal->nid; + break; default: break; } @@ -2796,6 +2859,8 @@ static struct damos *damon_sysfs_mk_scheme( .weight_nr_accesses = sysfs_weights->nr_accesses, .weight_age = sysfs_weights->age, .goal_tuner = sysfs_quotas->goal_tuner, + .fail_charge_num = sysfs_quotas->fail_charge_num, + .fail_charge_denom = sysfs_quotas->fail_charge_denom, }; struct damos_watermarks wmarks = { .metric = sysfs_wmarks->metric, diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index eefa959aa30ae..d5863cc33d230 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -866,6 +866,7 @@ struct damon_sysfs_context { struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; + bool pause; }; static struct damon_sysfs_context *damon_sysfs_context_alloc( @@ -878,6 +879,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( context->kobj = (struct kobject){}; context->ops_id = ops_id; context->addr_unit = 1; + context->pause = false; return context; } @@ -1053,6 +1055,30 @@ static ssize_t addr_unit_store(struct kobject *kobj, return count; } +static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%c\n", context->pause ? 
'Y' : 'N'); +} + +static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + bool pause; + int err = kstrtobool(buf, &pause); + + if (err) + return err; + context->pause = pause; + return count; +} + + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1067,10 +1093,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr = static struct kobj_attribute damon_sysfs_context_addr_unit_attr = __ATTR_RW_MODE(addr_unit, 0600); +static struct kobj_attribute damon_sysfs_context_pause_attr = + __ATTR_RW_MODE(pause, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, &damon_sysfs_context_addr_unit_attr.attr, + &damon_sysfs_context_pause_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1470,6 +1500,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (sys_ctx->ops_id == DAMON_OPS_PADDR) ctx->min_region_sz = max( DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); + ctx->pause = sys_ctx->pause; err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 9e5904c2beeb2..1b23a22ac04c4 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -694,6 +694,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 2, .sz = 3, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, + .fail_charge_num = 2, + .fail_charge_denom = 3, .weight_sz = 4, .weight_nr_accesses = 5, .weight_age = 6, @@ -703,6 +705,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 8, .sz = 9, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, + .fail_charge_num = 1, + .fail_charge_denom = 1024, .weight_sz = 10, .weight_nr_accesses 
= 11, .weight_age = 12, @@ -717,6 +721,8 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.ms, src.ms); KUNIT_EXPECT_EQ(test, dst.sz, src.sz); KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner); + KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num); + KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom); KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); @@ -1077,6 +1083,10 @@ static void damon_test_commit_ctx(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); src->min_region_sz = 4095; KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL); + src->min_region_sz = 4096; + src->pause = true; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); + KUNIT_EXPECT_TRUE(test, dst->pause); damon_destroy_ctx(src); damon_destroy_ctx(dst); } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b069dbc7e3d25..dd5f2d7027ac4 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -903,6 +903,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_COLLAPSE: + madv_action = MADV_COLLAPSE; + break; case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); diff --git a/mm/filemap.c b/mm/filemap.c index 4e636647100c1..ab34cab2416a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3314,6 +3314,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) bool force_thp_readahead = false; unsigned short mmap_miss; + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; + /* Use the readahead code, even if readahead is disabled */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) @@ -3396,6 +3398,7 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf) * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->start = max(ra->start, vmf->vma->vm_pgoff); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; @@ -3438,6 +3441,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, } if (folio_test_readahead(folio)) { + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } @@ -3747,8 +3751,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss, - pgoff_t file_end) + unsigned long *rss, pgoff_t file_end) { struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; @@ -3781,16 +3784,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, goto skip; /* - * If there are too many folios that are recently evicted - * in a file, they will probably continue to be evicted. - * In such situation, read-ahead is only a waste of IO. - * Don't decrease mmap_miss in this scenario to make sure - * we can stop read-ahead. - */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. 
@@ -3836,7 +3829,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3844,10 +3837,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) goto out; - /* See comment of filemap_map_folio_range() */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit @@ -3882,7 +3871,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; - unsigned short mmap_miss = 0, mmap_miss_saved; /* * Recalculate end_pgoff based on file_end before calling @@ -3921,6 +3909,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_type = mm_counter_file(folio); do { unsigned long end; + vm_fault_t map_ret; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; @@ -3928,13 +3917,35 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; - if (!folio_test_large(folio)) - ret |= filemap_map_order0_folio(vmf, - folio, addr, &rss, &mmap_miss); - else - ret |= filemap_map_folio_range(vmf, folio, - xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss, file_end); + if (!folio_test_large(folio)) { + map_ret = filemap_map_order0_folio(vmf, folio, addr, + &rss); + } else { + unsigned long start = xas.xa_index - folio->index; + + map_ret = filemap_map_folio_range(vmf, folio, start, + addr, nr_pages, &rss, + file_end); + } + ret |= map_ret; + + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. 
+ * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. + */ + if ((map_ret & VM_FAULT_NOPAGE) && + !(vmf->flags & FAULT_FLAG_TRIED) && + !folio_test_workingset(folio)) { + unsigned short mmap_miss; + + mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + if (mmap_miss) + WRITE_ONCE(file->f_ra.mmap_miss, + mmap_miss - 1); + } folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); @@ -3944,12 +3955,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, out: rcu_read_unlock(); - mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); - if (mmap_miss >= mmap_miss_saved) - WRITE_ONCE(file->f_ra.mmap_miss, 0); - else - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); - return ret; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/gup.c b/mm/gup.c index ad9ded39609cb..0692119b79043 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) || + unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!folio) return 0; - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) { gup_put_folio(folio, refs, flags); return 0; } @@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!folio) return 0; - if (unlikely(pud_val(orig) != pud_val(*pudp))) { + if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 653f2dc034036..b7df167f7acbf 100644 --- a/mm/huge_memory.c +++ 
b/mm/huge_memory.c @@ -429,61 +429,75 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } +enum defrag_mode { + DEFRAG_ALWAYS = 0, + DEFRAG_DEFER, + DEFRAG_DEFER_MADVISE, + DEFRAG_MADVISE, + DEFRAG_NEVER, +}; + +static const char * const defrag_mode_strings[] = { + [DEFRAG_ALWAYS] = "always", + [DEFRAG_DEFER] = "defer", + [DEFRAG_DEFER_MADVISE] = "defer+madvise", + [DEFRAG_MADVISE] = "madvise", + [DEFRAG_NEVER] = "never", +}; + +static const enum transparent_hugepage_flag defrag_flags[] = { + [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +}; + static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - const char *output; + int active = DEFRAG_NEVER; + int len = 0; + int i; - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) - output = "[always] defer defer+madvise madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - output = "always [defer] defer+madvise madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, - &transparent_hugepage_flags)) - output = "always defer [defer+madvise] madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags)) - output = "always defer defer+madvise [madvise] never"; - else - output = "always defer defer+madvise madvise [never]"; + for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) { + if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) { + active = i; + break; + } + } - return sysfs_emit(buf, "%s\n", output); + for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) { + if (i == active) + len += sysfs_emit_at(buf, len, "[%s] ", + defrag_mode_strings[i]); + else + len += sysfs_emit_at(buf, len, "%s 
", + defrag_mode_strings[i]); + } + + /* Replace trailing space with newline */ + buf[len - 1] = '\n'; + + return len; } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "always")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer+madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "never")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - 
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else + int mode, m; + + mode = sysfs_match_string(defrag_mode_strings, buf); + if (mode < 0) return -EINVAL; + for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) { + if (m == mode) + set_bit(defrag_flags[m], &transparent_hugepage_flags); + else + clear_bit(defrag_flags[m], &transparent_hugepage_flags); + } + return count; } static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); @@ -4192,11 +4206,10 @@ fail: folio_unlock(new_folio); /* - * Subpages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. + * Subpages whose mapping has been zapped may be freed + * earlier, but freeing them requires taking the + * lru_lock, so we defer put_page() on tail pages until + * after the split completes. 
*/ free_folio_and_swap_cache(new_folio); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c921287489de3..571212b80835e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2862,6 +2862,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct hugetlb_cgroup *h_cg_rsvd = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); @@ -2912,7 +2913,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( - idx, pages_per_huge_page(h), &h_cg); + idx, pages_per_huge_page(h), &h_cg_rsvd); if (ret) goto out_subpool_put; } @@ -2954,7 +2955,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, folio); + h_cg_rsvd, folio); } spin_unlock_irq(&hugetlb_lock); @@ -3006,7 +3007,7 @@ out_uncharge_cgroup: out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), - h_cg); + h_cg_rsvd); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 5725a367246d9..10424cd25e5a6 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat break; } - kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp, policy_name, !!test_cache); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b8452dbdb043f..28a843f30b32b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2528,8 +2528,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max, cc->progress++; continue; } - hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); - hend = 
round_down(vma->vm_end, HPAGE_PMD_SIZE); + hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { cc->progress++; continue; @@ -2808,6 +2808,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_HAS_PRIVATE: case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* @@ -2845,8 +2846,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, mmgrab(mm); lru_add_drain_all(); - hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = end & HPAGE_PMD_MASK; + hstart = ALIGN(start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE); for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2eff0d6b622b6..7c7ba17ce7af0 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,6 +92,7 @@ #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/workqueue.h> +#include <linux/xarray.h> #include <linux/crc32.h> #include <asm/sections.h> @@ -157,6 +158,8 @@ struct kmemleak_object { struct hlist_head area_list; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ + /* per-scan dedup count, valid only while in scan-local dedup xarray */ + unsigned int dup_count; char comm[TASK_COMM_LEN]; /* executable name */ }; @@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object) * Printing of the unreferenced objects information to the seq file. The * print_unreferenced function must be called with the object->lock held. 
*/ -static void print_unreferenced(struct seq_file *seq, - struct kmemleak_object *object) +static void __print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object, + bool hex_dump) { int i; unsigned long *entries; @@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq, object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); - hex_dump_object(seq, object); + if (hex_dump) + hex_dump_object(seq, object); warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum); for (i = 0; i < nr_entries; i++) { @@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq, } } +static void print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object) +{ + __print_unreferenced(seq, object, true); +} + /* * Print the kmemleak_object information. This function is used mainly for * debugging special cases when kmemleak operations. It must be called with @@ -1685,6 +1696,103 @@ unlock_put: } /* + * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it + * does not touch user memory that was freed concurrently; the rest of the + * report (backtrace, comm, pid) is always emitted since the kmemleak_object + * metadata is pinned by the caller. + */ +static void print_leak_locked(struct kmemleak_object *object, bool hex_dump) +{ + raw_spin_lock_irq(&object->lock); + __print_unreferenced(NULL, object, + hex_dump && (object->flags & OBJECT_ALLOCATED)); + raw_spin_unlock_irq(&object->lock); +} + +/* + * Per-scan dedup table for verbose leak printing. The xarray is keyed by + * stackdepot trace_handle and stores a pointer to the representative + * kmemleak_object. The per-scan repeat count lives in object->dup_count. + * + * dedup_record() must run outside object->lock: xa_store() may take + * mutexes (xa_node slab allocation) which lockdep would flag against the + * raw spinlock object->lock. 
+ */ +static void dedup_record(struct xarray *dedup, struct kmemleak_object *object, + depot_stack_handle_t trace_handle) +{ + struct kmemleak_object *rep; + void *old; + + /* + * No stack trace to dedup against: early-boot allocation tracked + * before kmemleak_init() set up object_cache, or stack_depot_save() + * failure under memory pressure. + */ + if (!trace_handle) { + print_leak_locked(object, true); + return; + } + + /* stack is available, now we can de-dup */ + rep = xa_load(dedup, trace_handle); + if (rep) { + rep->dup_count++; + return; + } + + /* + * Object is being torn down (use_count already hit zero); the + * tracked memory at object->pointer is unsafe to read, so skip. + */ + if (!get_object(object)) + return; + + object->dup_count = 1; + old = xa_store(dedup, trace_handle, object, GFP_ATOMIC); + if (xa_is_err(old)) { + /* xa_node allocation failed; fall back to inline print. */ + print_leak_locked(object, true); + put_object(object); + return; + } + /* + * scan_mutex serialises all writers to the dedup xarray, so xa_store() + * after a NULL xa_load() must always overwrite an empty slot. + */ + WARN_ON_ONCE(old); +} + +/* + * Drain the dedup table. Re-acquires object->lock and re-checks + * OBJECT_ALLOCATED before printing: while get_object() pins the + * kmemleak_object metadata, the underlying tracked allocation may have + * been freed since the scan walked it (kmemleak_free clears + * OBJECT_ALLOCATED under object->lock before the user memory goes away). + * The hex dump is skipped for coalesced entries since the bytes would + * differ across objects anyway. + */ +static void dedup_flush(struct xarray *dedup) +{ + struct kmemleak_object *object; + unsigned long idx; + unsigned int dup; + bool coalesced; + + xa_for_each(dedup, idx, object) { + dup = object->dup_count; + coalesced = dup > 1; + + print_leak_locked(object, !coalesced); + if (coalesced) + pr_warn(" ... 
and %u more object(s) with the same backtrace\n", + dup - 1); + put_object(object); + xa_erase(dedup, idx); + } +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1694,6 +1802,7 @@ static void kmemleak_scan(void) struct kmemleak_object *object; struct zone *zone; int __maybe_unused i; + struct xarray dedup; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1834,10 +1943,18 @@ static void kmemleak_scan(void) return; /* - * Scanning result reporting. + * Scanning result reporting. When verbose printing is enabled, dedupe + * by stackdepot trace_handle so each unique backtrace is logged once + * per scan, annotated with the number of objects that share it. The + * per-leak count below still reflects every object, and + * /sys/kernel/debug/kmemleak still lists them individually. */ + xa_init(&dedup); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + depot_stack_handle_t trace_handle; + bool dedup_print; + if (need_resched()) kmemleak_cond_resched(object); @@ -1849,18 +1966,33 @@ static void kmemleak_scan(void) if (!color_white(object)) continue; raw_spin_lock_irq(&object->lock); + trace_handle = 0; + dedup_print = false; if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; - - if (kmemleak_verbose) - print_unreferenced(NULL, object); - + if (kmemleak_verbose) { + trace_handle = object->trace_handle; + dedup_print = true; + } new_leaks++; } raw_spin_unlock_irq(&object->lock); + + /* + * Defer the verbose print outside object->lock: xa_store() + * may take xa_node slab locks at a higher wait-context level + * which lockdep would flag against the raw_spinlock_t + * object->lock. rcu_read_lock() keeps the kmemleak_object + * alive across the call. 
+ */ + if (dedup_print) + dedup_record(&dedup, object, trace_handle); } rcu_read_unlock(); + /* Flush'em all */ + dedup_flush(&dedup); + xa_destroy(&dedup); if (new_leaks) { kmemleak_found_leaks = true; diff --git a/mm/madvise.c b/mm/madvise.c index 69708e953cf56..cd9bb077072cc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) tlb_finish_mmu(madv_behavior->tlb); } -static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +/** + * check_input_range() - Check if the requested range is valid. + * @start: Start address of madvise-requested address range. + * @len_in: Length of madvise-requested address range. + * + * Returns: 0 if the input range is valid, otherwise an error code. + */ +static int check_input_range(unsigned long start, size_t len_in) { size_t len; - if (!madvise_behavior_valid(behavior)) - return false; - if (!PAGE_ALIGNED(start)) - return false; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return false; + return -EINVAL; if (start + len < start) - return false; - - return true; -} + return -EINVAL; -/* - * madvise_should_skip() - Return if the request is invalid or nothing. - * @start: Start address of madvise-requested address range. - * @len_in: Length of madvise-requested address range. - * @behavior: Requested madvise behavior. - * @err: Pointer to store an error code from the check. - * - * If the specified behaviour is invalid or nothing would occur, we skip the - * operation. This function returns true in the cases, otherwise false. In - * the former case we store an error on @err. 
- */ -static bool madvise_should_skip(unsigned long start, size_t len_in, - int behavior, int *err) -{ - if (!is_valid_madvise(start, len_in, behavior)) { - *err = -EINVAL; - return true; - } - if (start + PAGE_ALIGN(len_in) == start) { - *err = 0; - return true; - } - return false; + return 0; } static bool is_madvise_populate(struct madvise_behavior *madv_behavior) @@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh .tlb = &tlb, }; - if (madvise_should_skip(start, len_in, behavior, &error)) + if (!madvise_behavior_valid(behavior)) + return -EINVAL; + + error = check_input_range(start, len_in); + if (error || !len_in) return error; + error = madvise_lock(&madv_behavior); if (error) return error; @@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, size_t len_in = iter_iov_len(iter); int error; - if (madvise_should_skip(start, len_in, behavior, &error)) + error = check_input_range(start, len_in); + if (error || !len_in) ret = error; else ret = madvise_do_behavior(start, len_in, &madv_behavior); @@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + if (!madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1a4fd2504bcdf..431cad99189f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4005,11 +4005,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg1_alloc_events(memcg)) goto fail; + pstatc_pcpu = parent ? parent->vmstats_percpu : NULL; for_each_possible_cpu(cpu) { - if (parent) - pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent_pcpu = parent ? 
pstatc_pcpu : NULL; + statc->parent_pcpu = pstatc_pcpu; statc->vmstats = memcg->vmstats; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d47aef256a324..eff405a21c68b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -459,7 +459,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, * Only do anything when FORCEKILL is set, otherwise just free the * list (this is used for clean pages which do not need killing) */ -static void kill_procs(struct list_head *to_kill, int forcekill, +static void kill_procs(struct list_head *to_kill, bool forcekill, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -1418,7 +1418,7 @@ try_again: * We raced with (possibly temporary) unhandlable * page, retry. */ - if (pass++ < 3) { + if (pass++ < GET_PAGE_MAX_RETRY_NUM) { shake_page(p); goto try_again; } @@ -1582,7 +1582,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, { LIST_HEAD(tokill); bool unmap_success; - int forcekill; + bool forcekill; bool mlocked = folio_test_mlocked(folio); /* @@ -1703,7 +1703,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, unmap_mapping_range(mapping, start, size, 0); } - kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags); + kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags); } /* diff --git a/mm/memory.c b/mm/memory.c index 86a973119bd46..0c9d9c2cbf0e0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * - * Called with mmap_lock locked and the old page referenced, but - * without the ptl held. + * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK) + * and the old page referenced, but without the ptl held. 
* * High level logic flow: * @@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio, * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. We return with + * the same lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) @@ -4785,12 +4785,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * - * We return with the mmap_lock locked or unlocked in the same cases - * as does filemap_fault(). + * When returning, the lock may have been released in the same cases + * as done by filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { @@ -5330,9 +5330,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked. + * We return with the lock still held, but pte unmapped and unlocked. + * If VM_FAULT_RETRY is returned, the lock may have been released. 
*/ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { @@ -5440,9 +5441,10 @@ oom: } /* - * The mmap_lock must have been held on entry, and may have been - * released depending on flags and vma->vm_ops->fault() return value. - * See filemap_fault() and __lock_page_retry(). + * Either the VMA lock or the mmap_lock must have been held on entry + * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on + * flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { @@ -5480,7 +5482,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { - if (page_mapped(vmf->page)) + if (folio_mapped(folio)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) @@ -6003,11 +6005,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults). - * The mmap_lock may have been released depending on flags and our + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK). + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). - * If mmap_lock is released, vma may become invalid (for example + * If the lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) @@ -6374,10 +6376,11 @@ static void fix_spurious_fault(struct vm_fault *vmf, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow - * concurrent faults). 
+ * On entry, we hold either the VMA lock or the mmap_lock + * (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our return value. + * The mmap_lock or VMA lock may have been released depending on flags + * and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) @@ -6458,8 +6461,8 @@ unlock: /* * On entry, we hold either the VMA lock or the mmap_lock - * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in - * the result, the mmap_lock is not held on exit. See filemap_fault() + * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in + * the result, the lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, @@ -6691,9 +6694,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, /* * By the time we get here, we already hold either the VMA lock or the - * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). + * mmap_lock (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 40c7915dabe05..462d8dcd636dc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used + * @pgmap: device page map or %NULL if not ZONE_DEVICE * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. 
Caller needs to make @@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * calling offline_pages(). */ void __remove_pages(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; @@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - sparse_remove_section(pfn, cur_nr_pages, altmap); + sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap); } } @@ -1427,7 +1428,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) remove_memory_block_devices(cur_start, memblock_size); - arch_remove_memory(cur_start, memblock_size, altmap); + arch_remove_memory(cur_start, memblock_size, altmap, NULL); /* Verify that all vmemmap pages have actually been freed. */ WARN(altmap->alloc, "Altmap not fully unmapped"); @@ -1470,7 +1471,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { - arch_remove_memory(cur_start, memblock_size, NULL); + arch_remove_memory(cur_start, memblock_size, params.altmap, NULL); kfree(params.altmap); goto out; } @@ -1556,7 +1557,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { - arch_remove_memory(start, size, params.altmap); + arch_remove_memory(start, size, params.altmap, NULL); goto error; } } @@ -2268,7 +2269,7 @@ static int try_remove_memory(u64 start, u64 size) * No altmaps present, do the removal directly */ remove_memory_block_devices(start, size); - arch_remove_memory(start, size, NULL); + arch_remove_memory(start, size, 
NULL, NULL); } else { /* all memblocks in the range have altmaps */ remove_memory_blocks_and_altmaps(start, size); diff --git a/mm/memremap.c b/mm/memremap.c index 053842d45cb10..81766d8224009 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { __remove_pages(PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), NULL); + PHYS_PFN(range_len(range)), NULL, pgmap); } else { arch_remove_memory(range->start, range_len(range), - pgmap_altmap(pgmap)); + pgmap_altmap(pgmap), pgmap); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); diff --git a/mm/migrate.c b/mm/migrate.c index 8a64291ab5b44..0c6a0ab6eccef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, * This is safe because nobody is using it except us. */ enum { - PAGE_WAS_MAPPED = BIT(0), - PAGE_WAS_MLOCKED = BIT(1), - PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, + FOLIO_WAS_MAPPED = BIT(0), + FOLIO_WAS_MLOCKED = BIT(1), + FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED, }; static void __migrate_folio_record(struct folio *dst, - int old_page_state, - struct anon_vma *anon_vma) + int old_folio_state, struct anon_vma *anon_vma) { - dst->private = (void *)anon_vma + old_page_state; + dst->private = (void *)anon_vma + old_folio_state; } static void __migrate_folio_extract(struct folio *dst, - int *old_page_state, - struct anon_vma **anon_vmap) + int *old_folio_state, struct anon_vma **anon_vmap) { unsigned long private = (unsigned long)dst->private; - *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); - *old_page_state = private & PAGE_OLD_STATES; + *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES); + *old_folio_state = private & FOLIO_OLD_STATES; dst->private = NULL; } @@ -1209,7 +1207,7 @@ static int 
migrate_folio_unmap(new_folio_t get_new_folio, { struct folio *dst; int rc = -EAGAIN; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool locked = false; bool dst_locked = false; @@ -1253,7 +1251,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, } locked = true; if (folio_test_mlocked(src)) - old_page_state |= PAGE_WAS_MLOCKED; + old_folio_state |= FOLIO_WAS_MLOCKED; if (folio_test_writeback(src)) { /* @@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, dst_locked = true; if (unlikely(page_has_movable_ops(&src->page))) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); - old_page_state |= PAGE_WAS_MAPPED; + old_folio_state |= FOLIO_WAS_MAPPED; } if (!folio_mapped(src)) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1344,7 +1342,7 @@ out: if (rc == -EAGAIN) ret = NULL; - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); @@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct list_head *ret) { int rc; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool src_deferred_split = false; bool src_partially_mapped = false; struct list_head *prev; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); @@ -1404,10 +1402,10 @@ static int 
migrate_folio_move(free_folio_t put_new_folio, unsigned long private, * isolated from the unevictable LRU: but this case is the easiest. */ folio_add_lru(dst); - if (old_page_state & PAGE_WAS_MLOCKED) + if (old_folio_state & FOLIO_WAS_MLOCKED) lru_add_drain(); - if (old_page_state & PAGE_WAS_MAPPED) + if (old_folio_state & FOLIO_WAS_MAPPED) remove_migration_ptes(src, dst, 0); out_unlock_both: @@ -1439,11 +1437,11 @@ out: */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return rc; } - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); @@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios, dst = list_first_entry(dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, src_folios, lru) { - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); + migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 19cd14b341146..554754eb26ff2 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, bool flush = false; unsigned long i; - VM_WARN_ON_FOLIO(!folio, folio); - VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + VM_WARN_ON_ONCE(!folio); if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) return -EINVAL; @@ -859,11 +858,9 @@ 
static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, if (userfaultfd_missing(vma)) goto unlock_abort; - if (!pmd_none(*pmdp)) { - if (!is_huge_zero_pmd(*pmdp)) - goto unlock_abort; + if (is_huge_zero_pmd(*pmdp)) flush = true; - } else if (!pmd_none(*pmdp)) + else if (!pmd_none(*pmdp)) goto unlock_abort; add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); diff --git a/mm/mm_init.c b/mm/mm_init.c index f9f8e1af921cd..bd466a3c10c8e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -674,6 +674,20 @@ static inline void fixup_hashdist(void) static inline void fixup_hashdist(void) {} #endif /* CONFIG_NUMA */ +#ifdef CONFIG_ZONE_DEVICE +static __meminit void pageblock_migratetype_init_range(unsigned long pfn, + unsigned long nr_pages, int migratetype) +{ + const unsigned long end = pfn + nr_pages; + + for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) { + init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + } +} +#endif + /* * Initialize a reserved page unconditionally, finding its zone first. */ @@ -1012,21 +1026,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, page->zone_device_data = NULL; /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (pageblock_aligned(pfn)) { - init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); - cond_resched(); - } - - /* * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released * directly to the driver page allocator which will set the page count * to 1 when allocating the page. 
@@ -1056,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * of how the sparse_vmemmap internals handle compound pages in the lack * of an altmap. See vmemmap_populate_compound_pages(). */ -static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, +static inline unsigned long compound_nr_pages(unsigned long pfn, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - if (!vmemmap_can_optimize(altmap, pgmap)) + /* + * If DAX memory is hot-plugged into an unoccupied subsection + * of an early section, the unoptimized boot memmap is reused. + * See section_activate(). + */ + if (early_section(__pfn_to_section(pfn)) || + !vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); @@ -1122,13 +1128,18 @@ void __ref memmap_init_zone_device(struct zone *zone, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + if (pfns_per_compound == 1) continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - compound_nr_pages(altmap, pgmap)); + compound_nr_pages(pfn, altmap, pgmap)); } + pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE); + pr_debug("%s initialised %lu pages in %ums\n", __func__, nr_pages, jiffies_to_msecs(jiffies - start)); } diff --git a/mm/mmap.c b/mm/mmap.c index 5754d1c364624..2311ae7c2ff45 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * A locked or stack area makes no sense to be droppable. * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d49c254174da7..69a99af777771 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,6 +90,9 @@ typedef int __bitwise fpi_t; /* Free the page without taking locks. Rely on trylock only. 
*/ #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) +/* free_pages_prepare() has already been called for page(s) being freed. */ +#define FPI_PREPARED ((__force fpi_t)BIT(3)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -282,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +/* + * When page allocations stall for longer than a threshold, + * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning + * will be printed during this duration for the entire system. + */ +#define ALLOC_STALL_WARN_MSECS (10 * 1000UL) +static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES; + static bool page_contains_unaccepted(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order, int alloc_flags); @@ -1211,14 +1222,18 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +static void clear_highpages_kasan_tagged(struct page *page, int numpages) { - int i; - /* s390's use of memset() could override KASAN redzones. 
*/ kasan_disable_current(); - for (i = 0; i < numpages; i++) - clear_highpage_kasan_tagged(page + i); + if (!IS_ENABLED(CONFIG_HIGHMEM)) { + clear_pages(kasan_reset_tag(page_address(page)), numpages); + } else { + int i; + + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); + } kasan_enable_current(); } @@ -1303,8 +1318,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #endif /* CONFIG_MEM_ALLOC_PROFILING */ -__always_inline bool __free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) +static __always_inline bool __free_pages_prepare(struct page *page, + unsigned int order, fpi_t fpi_flags) { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page); @@ -1312,6 +1327,9 @@ __always_inline bool __free_pages_prepare(struct page *page, bool compound = PageCompound(page); struct folio *folio = page_folio(page); + if (fpi_flags & FPI_PREPARED) + return true; + VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); @@ -1423,7 +1441,7 @@ __always_inline bool __free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1451,7 +1469,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { - unsigned long flags; unsigned int order; struct page *page; @@ -1464,7 +1481,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. 
*/ pindex = pindex - 1; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (count > 0) { struct list_head *list; @@ -1496,8 +1513,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } - - spin_unlock_irqrestore(&zone->lock, flags); } /* Split a multi-block free page into its individual pageblocks. */ @@ -1848,7 +1863,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } /* If memory is still not initialized, initialize it now. */ if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << order); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); @@ -3424,7 +3439,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order, struct zone *zone) { int mt; - unsigned long max_managed, flags; + unsigned long max_managed; /* * The number reserved as: minimum is 1 pageblock, maximum is @@ -3438,29 +3453,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order, if (zone->nr_reserved_highatomic >= max_managed) return; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); /* Recheck the nr_reserved_highatomic limit under the lock */ if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; + return; /* Yoink! 
*/ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (!migratetype_is_mergeable(mt)) - goto out_unlock; + return; if (order < pageblock_order) { if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; + return; zone->nr_reserved_highatomic += pageblock_nr_pages; } else { change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); zone->nr_reserved_highatomic += 1 << order; } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); } /* @@ -3476,7 +3488,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, bool force) { struct zonelist *zonelist = ac->zonelist; - unsigned long flags; struct zoneref *z; struct zone *zone; struct page *page; @@ -3493,7 +3504,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, pageblock_nr_pages) continue; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); unsigned long size; @@ -3540,12 +3551,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * so this should not fail on zone boundaries. 
*/ WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); + if (ret > 0) return ret; - } } - spin_unlock_irqrestore(&zone->lock, flags); } return false; @@ -4678,6 +4686,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask, + unsigned int order, unsigned long alloc_start_time) +{ + static DEFINE_SPINLOCK(alloc_stall_lock); + unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time); + + if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS)) + return; + if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies))) + return; + if (gfp_mask & __GFP_NOWARN) + return; + + if (!spin_trylock(&alloc_stall_lock)) + return; + + /* Check again, this time under the lock */ + if (time_is_after_jiffies(alloc_stall_warn_jiffies)) { + spin_unlock(&alloc_stall_lock); + return; + } + + WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS)); + spin_unlock(&alloc_stall_lock); + + pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl", + current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); + cpuset_print_current_mems_allowed(); + pr_cont("\n"); + dump_stack(); + warn_alloc_show_mem(gfp_mask, nodemask); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4698,6 +4740,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; bool compact_first = false; bool can_retry_reserves = true; + unsigned long alloc_start_time = jiffies; if (unlikely(nofail)) { /* @@ -4813,6 +4856,9 @@ retry: if (current->flags & PF_MEMALLOC) goto nopage; + /* If allocation has taken excessively long, warn about it */ + check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time); + /* Try direct reclaim and then allocating */ if 
(!compact_first) { page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, @@ -5044,7 +5090,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct per_cpu_pages *pcp; struct list_head *pcp_list; struct alloc_context ac; - gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0, nr_account = 0; @@ -5085,10 +5130,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ gfp &= gfp_allowed_mask; - alloc_gfp = gfp; - if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags)) goto out; - gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. */ z = ac.preferred_zoneref; @@ -5180,6 +5223,34 @@ failed: EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* + * free_pages_bulk - Free an array of order-0 pages + * @page_array: Array of pages to free + * @nr_pages: The number of pages in the array + * + * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous + * run are released with a single __free_contig_range() call. + * + * This assumes page_array is sorted in ascending PFN order. Without that, + * the function still frees all pages, but contiguous runs may not be + * detected and the freeing pattern can degrade to freeing one page at a + * time. + * + * Context: Sleepable process context only; calls cond_resched() + */ +void free_pages_bulk(struct page **page_array, unsigned long nr_pages) +{ + while (nr_pages) { + unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages); + + __free_contig_range(page_to_pfn(*page_array), nr_contig); + + nr_pages -= nr_contig; + page_array += nr_contig; + cond_resched(); + } +} + +/* * This is the 'heart' of the zoned buddy allocator. 
*/ struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, @@ -6758,6 +6829,105 @@ void __init page_alloc_sysctl_init(void) register_sysctl_init("vm", page_alloc_sysctl_table); } +static void free_prepared_contig_range(struct page *page, + unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + while (nr_pages) { + unsigned int order; + + /* We are limited by the largest buddy order. */ + order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER; + /* Don't exceed the number of pages to free. */ + order = min_t(unsigned int, order, ilog2(nr_pages)); + order = min_t(unsigned int, order, MAX_PAGE_ORDER); + + /* + * Free the chunk as a single block. Our caller has already + * called free_pages_prepare() for each order-0 page. + */ + __free_frozen_pages(page, order, FPI_PREPARED); + + pfn += 1UL << order; + page += 1UL << order; + nr_pages -= 1UL << order; + } +} + +static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages, + bool is_frozen) +{ + struct page *page, *start = NULL; + unsigned long nr_start = 0; + unsigned long start_sec; + unsigned long i; + + for (i = 0; i < nr_pages; i++) { + bool can_free = true; + + /* + * Contiguous PFNs might not have contiguous "struct pages" + * in some kernel configs: page++ across a section boundary + * is undefined. Use pfn_to_page() for each PFN. 
+ */ + page = pfn_to_page(pfn + i); + + VM_WARN_ON_ONCE(PageHead(page)); + VM_WARN_ON_ONCE(PageTail(page)); + + if (!is_frozen) + can_free = put_page_testzero(page); + + if (can_free) + can_free = free_pages_prepare(page, 0); + + if (!can_free) { + if (start) { + free_prepared_contig_range(start, i - nr_start); + start = NULL; + } + continue; + } + + if (start && memdesc_section(page->flags) != start_sec) { + free_prepared_contig_range(start, i - nr_start); + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } else if (!start) { + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } + } + + if (start) + free_prepared_contig_range(start, nr_pages - nr_start); +} + +/** + * __free_contig_range - Free contiguous range of order-0 pages. + * @pfn: Page frame number of the first page in the range. + * @nr_pages: Number of pages to free. + * + * For each order-0 struct page in the physically contiguous range, put a + * reference. Free any page whose reference count falls to zero. The + * implementation is functionally equivalent to, but significantly faster than + * calling __free_page() for each struct page in a loop. + * + * Memory allocated with alloc_pages(order>=1) then subsequently split to + * order-0 with split_page() is an example of appropriate contiguous pages that + * can be freed with this API. + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. 
+ */ +void __free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false); +} + #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) @@ -6895,8 +7065,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) { - for (; nr_pages--; pfn++) - free_frozen_pages(pfn_to_page(pfn), 0); + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true); } /** @@ -7304,8 +7473,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) return; - for (; nr_pages--; pfn++) - __free_page(pfn_to_page(pfn)); + __free_contig_range(pfn, nr_pages); } EXPORT_SYMBOL(free_contig_range); #endif /* CONFIG_CONTIG_ALLOC */ @@ -7363,7 +7531,7 @@ void zone_pcp_reset(struct zone *zone) unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long already_offline = 0, flags; + unsigned long already_offline = 0; unsigned long pfn = start_pfn; struct page *page; struct zone *zone; @@ -7371,7 +7539,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (pfn < end_pfn) { page = pfn_to_page(pfn); /* @@ -7401,7 +7569,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } - spin_unlock_irqrestore(&zone->lock, flags); return end_pfn - start_pfn - already_offline; } @@ -7473,11 +7640,9 @@ bool take_page_off_buddy(struct page *page) { struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - bool ret = false; - 
spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); @@ -7492,14 +7657,12 @@ bool take_page_off_buddy(struct page *page) break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - ret = true; - break; + return true; } if (page_count(page_head) > 0) break; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } /* @@ -7508,23 +7671,19 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long flags; - bool ret = false; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if (put_page_testzero(page)) { unsigned long pfn = page_to_pfn(page); int migratetype = get_pfnblock_migratetype(page, pfn); ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); - if (TestClearPageHWPoison(page)) { - ret = true; - } + if (TestClearPageHWPoison(page)) + return true; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } #endif diff --git a/mm/page_io.c b/mm/page_io.c index 70cea9e24d2fd..7ed76592e20d8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -326,8 +326,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct swap_iocb { struct kiocb iocb; - struct bio_vec bvec[SWAP_CLUSTER_MAX]; - int pages; + struct bio_vec bvecs[SWAP_CLUSTER_MAX]; + int nr_bvecs; int len; }; static mempool_t *sio_pool; @@ -348,7 +348,7 @@ int sio_pool_init(void) static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); - struct page *page = sio->bvec[0].bv_page; + struct page *page = sio->bvecs[0].bv_page; int p; if (ret != sio->len) { @@ -362,15 +362,15 @@ static void 
sio_write_complete(struct kiocb *iocb, long ret) */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); - for (p = 0; p < sio->pages; p++) { - page = sio->bvec[p].bv_page; + for (p = 0; p < sio->nr_bvecs; p++) { + page = sio->bvecs[p].bv_page; set_page_dirty(page); ClearPageReclaim(page); } } - for (p = 0; p < sio->pages; p++) - end_page_writeback(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) + end_page_writeback(sio->bvecs[p].bv_page); mempool_free(sio, sio_pool); } @@ -397,13 +397,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } @@ -477,7 +477,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -489,8 +489,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) int p; if (ret == sio->len) { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = page_folio(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = page_folio(sio->bvecs[p].bv_page); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); 
@@ -499,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) } count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = page_folio(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = page_folio(sio->bvecs[p].bv_page); folio_unlock(folio); } @@ -559,13 +559,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) init_sync_kiocb(&sio->iocb, sis->swap_file); sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) { swap_read_unplug(sio); sio = NULL; } @@ -666,7 +666,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c48ff5c002449..7a9d631945a34 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, { struct zone *zone = page_zone(page); struct page *unmovable; - unsigned long flags; unsigned long check_unmovable_start, check_unmovable_end; if (PageUnaccepted(page)) accept_page(page); - spin_lock_irqsave(&zone->lock, flags); - - /* - * We assume the caller intended to SET migrate type to isolate. 
- * If it is already set, then someone else must have raced and - * set it before us. - */ - if (is_migrate_isolate_page(page)) { - spin_unlock_irqrestore(&zone->lock, flags); - return -EBUSY; - } + scoped_guard(spinlock_irqsave, &zone->lock) { + /* + * We assume the caller intended to SET migrate type to + * isolate. If it is already set, then someone else must have + * raced and set it before us. + */ + if (is_migrate_isolate_page(page)) + return -EBUSY; - /* - * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. - * We just check MOVABLE pages. - * - * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock - * to avoid redundant checks. - */ - check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), - end_pfn); + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by + * itself. We just check MOVABLE pages. + * + * Pass the intersection of [start_pfn, end_pfn) and the page's + * pageblock to avoid redundant checks. 
+ */ + check_unmovable_start = max(page_to_pfn(page), start_pfn); + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), + end_pfn); - unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, - mode); - if (!unmovable) { - if (!pageblock_isolate_and_move_free_pages(zone, page)) { - spin_unlock_irqrestore(&zone->lock, flags); - return -EBUSY; + unmovable = has_unmovable_pages(check_unmovable_start, + check_unmovable_end, mode); + if (!unmovable) { + if (!pageblock_isolate_and_move_free_pages(zone, page)) + return -EBUSY; + zone->nr_isolate_pageblock++; + return 0; } - zone->nr_isolate_pageblock++; - spin_unlock_irqrestore(&zone->lock, flags); - return 0; } - - spin_unlock_irqrestore(&zone->lock, flags); if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) { /* * printk() with zone->lock held will likely trigger a @@ -223,15 +215,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, static void unset_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if (!is_migrate_isolate_page(page)) - goto out; + return; /* * Because freepage with more than pageblock_order on isolated @@ -279,8 +270,6 @@ static void unset_migratetype_isolate(struct page *page) __putback_isolated_page(page, order, get_pageblock_migratetype(page)); } zone->nr_isolate_pageblock--; -out: - spin_unlock_irqrestore(&zone->lock, flags); } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c index 8178e0be557f8..2dddcb6510aa1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], - &page->flags); + &page->flags.f); ret += stack_depot_snprint(handle, kbuf + ret, count 
- ret, 0); if (ret >= count) diff --git a/mm/readahead.c b/mm/readahead.c index 7b05082c89ea2..8c12b63ccd4a2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -324,6 +324,8 @@ static void do_page_cache_ra(struct readahead_control *ractl, return; end_index = (isize - 1) >> PAGE_SHIFT; + if (end_index > ractl->_max_index) + end_index = ractl->_max_index; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ @@ -471,7 +473,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); - pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + pgoff_t limit; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; @@ -484,6 +486,8 @@ void page_cache_ra_order(struct readahead_control *ractl, goto fallback; } + limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + limit = min(limit, ractl->_max_index); limit = min(limit, index + ra->size - 1); new_order = min(mapping_max_folio_order(mapping), new_order); diff --git a/mm/rmap.c b/mm/rmap.c index 99e1b3dc390b7..1c77d5dc06e9f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -571,7 +571,7 @@ void __init anon_vma_init(void) * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as - * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * long as we observe folio_mapped() [ hence all those folio_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it @@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. 
zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_unmap() may return before page_mapped() has become false, + * try_to_unmap() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) @@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_migrate() may return before page_mapped() has become false, + * try_to_migrate() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) @@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() - * because that depends on page_mapped(); but not all its usages + * because that depends on folio_mapped(); but not all its usages * are holding mmap_lock. 
Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ diff --git a/mm/shmem.c b/mm/shmem.c index 3b5dc21b323c2..bab3529af23c5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3101,10 +3101,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - - /* Don't consider 'deny' for emergencies and 'force' for testing */ - if (sbinfo->huge) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 6eadb9d116e43..112ccf9c71caf 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap) { - void *ptr; - if (altmap) return altmap_alloc_block_buf(size, altmap); - ptr = sparse_buffer_alloc(size); - if (!ptr) - ptr = vmemmap_alloc_block(size, node); - return ptr; + return vmemmap_alloc_block(size, node); } static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) @@ -151,7 +146,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, start, end - 1); } -pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, +static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, unsigned long ptpfn, unsigned long flags) { @@ -195,7 +190,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) return p; } -pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { @@ -208,7 +203,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned 
long addr, int node) return pmd; } -pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) +static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { @@ -221,7 +216,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) return pud; } -p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { @@ -234,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) return p4d; } -pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { @@ -652,26 +647,61 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } +static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + const unsigned int order = pgmap ? 
pgmap->vmemmap_shift : 0; + const unsigned long pages_per_compound = 1UL << order; + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION)); + VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION); + + if (!vmemmap_can_optimize(altmap, pgmap)) + return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE); + + if (order < PFN_SECTION_SHIFT) { + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound)); + return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound; + } + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)); + + if (IS_ALIGNED(pfn, pages_per_compound)) + return VMEMMAP_RESERVE_NR; + + return 0; +} + static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap, + pgmap); + + memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); + + return page; } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); + memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); vmemmap_free(start, end, altmap); } + static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long pfn = page_to_pfn(memmap); + memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION, + NULL, NULL)); vmemmap_free(start, end, NULL); } @@ -737,7 +767,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) * usage map, but still need to free the vmemmap range. 
*/ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); @@ -774,14 +804,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . */ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); - depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); + if (!section_is_early) + depopulate_section_memmap(pfn, nr_pages, altmap, pgmap); + else if (memmap) free_map_bootmem(memmap); - } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -823,10 +849,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); return ERR_PTR(-ENOMEM); } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } @@ -885,13 +910,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, } void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/sparse.c b/mm/sparse.c index effdac6b0ab13..16ac6df3c89fa 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -201,13 +201,11 @@ static void __init memblocks_present(void) int i, nid; 
#ifdef CONFIG_SPARSEMEM_EXTREME - if (unlikely(!mem_section)) { - unsigned long size, align; + unsigned long size, align; - size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc_or_panic(size, align); - } + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc_or_panic(size, align); #endif for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) @@ -241,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn, struct dev_pagemap *pgmap) { unsigned long size = section_map_size(); - struct page *map = sparse_buffer_alloc(size); + struct page *map; phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - if (map) - return map; - map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -256,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn, } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -static void *sparsemap_buf __meminitdata; -static void *sparsemap_buf_end __meminitdata; - -static inline void __meminit sparse_buffer_free(unsigned long size) -{ - WARN_ON(!sparsemap_buf || size == 0); - memblock_free(sparsemap_buf, size); -} - -static void __init sparse_buffer_init(unsigned long size, int nid) -{ - phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
*/ - /* - * Pre-allocated buffer is mainly used by __populate_section_memmap - * and we want it to be properly aligned to the section size - this is - * especially the case for VMEMMAP which maps memmap to PMDs - */ - sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); - sparsemap_buf_end = sparsemap_buf + size; -} - -static void __init sparse_buffer_fini(void) -{ - unsigned long size = sparsemap_buf_end - sparsemap_buf; - - if (sparsemap_buf && size > 0) - sparse_buffer_free(size); - sparsemap_buf = NULL; -} - -void * __meminit sparse_buffer_alloc(unsigned long size) -{ - void *ptr = NULL; - - if (sparsemap_buf) { - ptr = (void *) roundup((unsigned long)sparsemap_buf, size); - if (ptr + size > sparsemap_buf_end) - ptr = NULL; - else { - /* Free redundant aligned space */ - if ((unsigned long)(ptr - sparsemap_buf) > 0) - sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); - sparsemap_buf = ptr + size; - } - } - return ptr; -} - void __weak __meminit vmemmap_populate_print_last(void) { } @@ -362,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, goto failed; } - sparse_buffer_init(map_count * section_map_size(), nid); - sparse_vmemmap_init_nid_early(nid); for_each_present_section_nr(pnum_begin, pnum) { @@ -381,7 +325,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, __func__, nid); pnum_begin = pnum; sparse_usage_fini(); - sparse_buffer_fini(); goto failed; } memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), @@ -390,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, } } sparse_usage_fini(); - sparse_buffer_fini(); return; failed: /* diff --git a/mm/swap.c b/mm/swap.c index 5cc44f0de9877..2dd84813f4dde 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -160,14 +160,42 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; + struct folio_batch 
free_fbatch; + bool is_lru_add = (move_fn == lru_add); + + /* + * If we're adding to the LRU, preemptively filter dead folios. Use + * this dedicated folio batch for temp storage and deferred cleanup. + */ + if (is_lru_add) + folio_batch_init(&free_fbatch); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ - if (move_fn != lru_add && !folio_test_clear_lru(folio)) + if (!is_lru_add && !folio_test_clear_lru(folio)) continue; + /* + * Filter dead folios by moving them from the add batch to the temp + * batch for freeing after this loop. + * + * We're bypassing normal cleanup. Clear flags that are not + * applicable to dead folios. + * + * Since the folio may be part of a huge page, unqueue from + * deferred split list to avoid a dangling list entry. + */ + if (is_lru_add && folio_ref_freeze(folio, 1)) { + __folio_clear_active(folio); + __folio_clear_unevictable(folio); + folio_unqueue_deferred_split(folio); + fbatch->folios[i] = NULL; + folio_batch_add(&free_fbatch, folio); + continue; + } + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); + + /* Cleanup filtered dead folios. */ + if (is_lru_add) { + mem_cgroup_uncharge_folios(&free_fbatch); + free_unref_folios(&free_fbatch); + } + folios_put(fbatch); } @@ -964,6 +999,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; + /* Folio batch entry may have been preemptively removed during drain. 
*/ + if (!folio) + continue; + if (is_huge_zero_folio(folio)) continue; diff --git a/mm/swapfile.c b/mm/swapfile.c index 9174f1eeffb09..74a1e324449dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1054,6 +1054,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) swap_cluster_unlock(ci); if (to_scan <= 0) break; + cond_resched(); } } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bb6ae08d18f58..eabb86b13b7e5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3459,19 +3459,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - for (i = 0; i < vm->nr_pages; i++) { - struct page *page = vm->pages[i]; - BUG_ON(!page); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_lruvec_page_state(page, NR_VMALLOC, -1); - __free_page(page); - cond_resched(); + if (!(vm->flags & VM_MAP_PUT_PAGES)) { + for (i = 0; i < vm->nr_pages; i++) + mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); } + free_pages_bulk(vm->pages, vm->nr_pages); + kvfree(vm->pages); kfree(vm); } @@ -3939,7 +3933,7 @@ fail: __GFP_NOFAIL | __GFP_ZERO |\ __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ - GFP_USER | __GFP_NOLOCKDEP) + GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN) static gfp_t vmalloc_fix_flags(gfp_t flags) { @@ -3980,6 +3974,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * * %__GFP_NOWARN can be used to suppress failure messages. * + * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages + * (when prot=%PAGE_KERNEL). + * * Can not be called from interrupt nor NMI contexts. 
* Return: the address of the area or %NULL on failure */ @@ -3993,6 +3990,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; + bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN); if (WARN_ON_ONCE(!size)) return NULL; @@ -4023,7 +4021,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, - gfp_mask, caller); + gfp_mask & ~__GFP_SKIP_KASAN, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, @@ -4041,7 +4039,7 @@ again: * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - if (kasan_hw_tags_enabled()) { + if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) { /* * Modify protection bits to allow tagging. * This must be done before mapping. @@ -4078,7 +4076,8 @@ again: (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. 
*/ - area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); + if (!skip_vmalloc_kasan) + area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 3fbb86996c4d2..f053554e58264 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work) /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask + * @order: allocation order being reclaimed for * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned @@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work) * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; @@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); - if (level > VMPRESSURE_LOW) { + /* + * Once we go above COSTLY_ORDER, reclaim relies heavily on + * compaction to make progress. Reclaim efficiency was never a + * great proxy for pressure to begin with, but it's outright + * misleading with these high orders. Don't throttle sockets + * because somebody is attempting something crazy like an order-7 + * and predictably struggling. + */ + if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. @@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. 
*/ - vmpressure(gfp, memcg, true, vmpressure_win, 0); + vmpressure(gfp, 0, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) diff --git a/mm/vmscan.c b/mm/vmscan.c index bd1b1aa125819..4b09843876583 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -109,7 +109,7 @@ struct scan_control { /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ + /* zone_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -5071,8 +5071,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + vmpressure(sc->gfp_mask, sc->order, memcg, false, + sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); @@ -6175,7 +6175,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) /* Record the group's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, + vmpressure(sc->gfp_mask, sc->order, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); @@ -6220,7 +6220,7 @@ again: /* Record the subtree's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) @@ -6359,7 +6359,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) if (current_is_kswapd() || cgroup_reclaim(sc)) return; - /* Throttle if making no progress at high prioities. */ + /* Throttle if making no progress at high priorities. 
*/ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } @@ -7121,6 +7121,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) .may_unmap = 1, }; + trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order, + highest_zoneidx); set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); @@ -7222,7 +7224,7 @@ restart: /* * There should be no need to raise the scanning priority if - * enough pages are already being scanned that that high + * enough pages are already being scanned that the high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) @@ -7314,6 +7316,9 @@ out: psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); + trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order, + highest_zoneidx, sc.nr_reclaimed); + /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller |
