aboutsummaryrefslogtreecommitdiffstats
diff options
authorMark Brown <broonie@kernel.org>2026-05-29 17:54:33 +0100
committerMark Brown <broonie@kernel.org>2026-05-29 17:54:33 +0100
commit18a549b897f99757e5ec7ed570222de21c36285f (patch)
tree3ad8f162aac9dc9dcbb07a50b050f08272f209d9
parent1eb7867715322670ed08997d90ecd153ac5fbe4f (diff)
parentcfaef29c20e86738aec28641b6de1e078298999e (diff)
downloadlinux-next-history-18a549b897f99757e5ec7ed570222de21c36285f.tar.gz
Merge branch 'mm-stable' of https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-damon81
-rw-r--r--Documentation/admin-guide/mm/damon/lru_sort.rst8
-rw-r--r--Documentation/admin-guide/mm/damon/reclaim.rst19
-rw-r--r--Documentation/admin-guide/mm/damon/stat.rst7
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst38
-rw-r--r--Documentation/mm/damon/design.rst33
-rw-r--r--Documentation/mm/damon/maintainer-profile.rst21
-rw-r--r--arch/arm64/mm/mmu.c5
-rw-r--r--arch/loongarch/mm/init.c5
-rw-r--r--arch/powerpc/mm/mem.c5
-rw-r--r--arch/riscv/mm/init.c5
-rw-r--r--arch/s390/mm/init.c5
-rw-r--r--arch/sh/mm/cache-sh4.c2
-rw-r--r--arch/x86/mm/init_64.c5
-rw-r--r--drivers/dax/kmem.c6
-rw-r--r--drivers/gpu/drm/drm_managed.c4
-rw-r--r--include/asm-generic/pgalloc.h2
-rw-r--r--include/linux/damon.h18
-rw-r--r--include/linux/gfp.h4
-rw-r--r--include/linux/gfp_types.h6
-rw-r--r--include/linux/highmem-internal.h2
-rw-r--r--include/linux/memory_hotplug.h8
-rw-r--r--include/linux/mm.h18
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/thread_info.h2
-rw-r--r--include/linux/vmpressure.h9
-rw-r--r--include/trace/events/vmscan.h52
-rw-r--r--kernel/bpf/arena.c2
-rw-r--r--kernel/fork.c5
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/damon/core.c335
-rw-r--r--mm/damon/lru_sort.c54
-rw-r--r--mm/damon/ops-common.c9
-rw-r--r--mm/damon/reclaim.c93
-rw-r--r--mm/damon/stat.c92
-rw-r--r--mm/damon/sysfs-schemes.c65
-rw-r--r--mm/damon/sysfs.c31
-rw-r--r--mm/damon/tests/core-kunit.h10
-rw-r--r--mm/damon/vaddr.c3
-rw-r--r--mm/filemap.c67
-rw-r--r--mm/gup.c8
-rw-r--r--mm/huge_memory.c107
-rw-r--r--mm/hugetlb.c7
-rw-r--r--mm/kfence/kfence_test.c2
-rw-r--r--mm/khugepaged.c9
-rw-r--r--mm/kmemleak.c148
-rw-r--r--mm/madvise.c60
-rw-r--r--mm/memcontrol.c5
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c57
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/memremap.c4
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/migrate_device.c9
-rw-r--r--mm/mm_init.c47
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/page_alloc.c265
-rw-r--r--mm/page_io.c42
-rw-r--r--mm/page_isolation.c65
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/sparse-vmemmap.c75
-rw-r--r--mm/sparse.c68
-rw-r--r--mm/swap.c41
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/vmalloc.c29
-rw-r--r--mm/vmpressure.c15
-rw-r--r--mm/vmscan.c19
-rw-r--r--net/rds/tcp_recv.c2
-rw-r--r--tools/testing/selftests/cgroup/lib/cgroup_util.c18
-rw-r--r--tools/testing/selftests/cgroup/lib/include/cgroup_util.h4
-rw-r--r--tools/testing/selftests/cgroup/test_core.c2
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c2
-rw-r--r--tools/testing/selftests/cgroup/test_kmem.c13
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c19
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c181
-rw-r--r--tools/testing/selftests/damon/_damon_sysfs.py31
-rwxr-xr-xtools/testing/selftests/damon/drgn_dump_damon_status.py3
-rwxr-xr-xtools/testing/selftests/damon/sysfs.py56
-rw-r--r--tools/testing/selftests/mm/Makefile10
-rwxr-xr-xtools/testing/selftests/mm/check_config.sh2
-rw-r--r--tools/testing/selftests/mm/droppable.c9
-rw-r--r--tools/testing/selftests/mm/khugepaged.c18
-rwxr-xr-xtools/testing/selftests/mm/ksft_kmemleak_dedup.sh222
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c84
-rw-r--r--tools/testing/selftests/mm/mremap_test.c109
-rw-r--r--tools/testing/selftests/mm/process_madv.c28
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh1
90 files changed, 2234 insertions, 898 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index 2424237ebb105..ee29d4e204ffa 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -84,6 +84,13 @@ Description: Writing an integer to this file sets the 'address unit'
parameter of the given operations set of the context. Reading
the file returns the last-written 'address unit' value.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/pause
+Date: Mar 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a boolean keyword to this file sets the 'pause' request
+ parameter for the context. Reading the file returns the
+ last-written 'pause' value.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/sample_us
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@@ -322,6 +329,18 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the
goal-based effective quota auto-tuning algorithm to use.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/fail_charge_num
+Date: Mar 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the
+ action-failed memory quota charging ratio numerator.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/fail_charge_denom
+Date: Mar 2026
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the
+ action-failed memory quota charging ratio denominator.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/sz_permil
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@@ -377,15 +396,20 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the low
watermark of the scheme in permil.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/nr_filters
-Date: Dec 2022
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters
+Date: Feb 2025
+Contact: SeongJae Park <sj@kernel.org>
+Description: Directory for DAMON core layer-handled DAMOS filters.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/nr_filters
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a number 'N' to this file creates the number of
directories for setting filters of the scheme named '0' to
- 'N-1' under the filters/ directory.
+ 'N-1' under the core_filters/ directory.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/type
-Date: Dec 2022
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/type
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the type of
the memory of the interest. 'anon' for anonymous pages,
@@ -393,77 +417,78 @@ Description: Writing to and reading from this file sets and gets the type of
'addr' for address range (an open-ended interval), or 'target'
for DAMON monitoring target can be written and read.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
-Date: Dec 2022
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/memcg_path
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'memcg' is written to the 'type' file, writing to and
reading from this file sets and gets the path to the memory
cgroup of the interest.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/addr_start
-Date: Jul 2023
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/addr_start
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'addr' is written to the 'type' file, writing to or reading
from this file sets or gets the start address of the address
range for the filter.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/addr_end
-Date: Jul 2023
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/addr_end
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'addr' is written to the 'type' file, writing to or reading
from this file sets or gets the end address of the address
range for the filter.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/min
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/min
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'hugepage_size' is written to the 'type' file, writing to
or reading from this file sets or gets the minimum size of the
hugepage for the filter.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/max
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/max
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'hugepage_size' is written to the 'type' file, writing to
or reading from this file sets or gets the maximum size of the
hugepage for the filter.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/target_idx
-Date: Dec 2022
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/target_idx
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'target' is written to the 'type' file, writing to or
reading from this file sets or gets the index of the DAMON
monitoring target of the interest.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
-Date: Dec 2022
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/matching
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing 'Y' or 'N' to this file sets whether the filter is for
the memory of the 'type', or all except the 'type'.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/allow
-Date: Jan 2025
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters/<F>/allow
+Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing 'Y' or 'N' to this file sets whether to allow or reject
applying the scheme's action to the memory that satisfies the
'type' and the 'matching' of the directory.
-What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters
-Date: Feb 2025
-Contact: SeongJae Park <sj@kernel.org>
-Description: Directory for DAMON core layer-handled DAMOS filters. Files
- under this directory works same to those of
- /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
- directory.
-
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/ops_filters
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Directory for DAMON operations set layer-handled DAMOS filters.
Files under this directory works same to those of
- /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
+ /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters
directory.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Directory for DAMOS filters. Files under this directory works
+ same to those of
+ /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/{core,ops}_filters
+ directory. This is deprecated. Use the core_filters and
+ ops_filters instead.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/dests/nr_dests
Date: Jul 2025
Contact: SeongJae Park <sj@kernel.org>
diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst
index 14cc6b2db8973..b93ca9b0853d2 100644
--- a/Documentation/admin-guide/mm/damon/lru_sort.rst
+++ b/Documentation/admin-guide/mm/damon/lru_sort.rst
@@ -75,7 +75,7 @@ Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``.
Input parameters that updated while DAMON_LRU_SORT is running are not applied
by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads values
-of parametrs except ``enabled`` again. Once the re-reading is done, this
+of parameters except ``enabled`` again. Once the re-reading is done, this
parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_LRU_SORT will be disabled.
@@ -246,7 +246,8 @@ monitor_region_start
Start of target memory region in physical address.
The start physical address of memory region that DAMON_LRU_SORT will do work
-against. By default, biggest System RAM is used as the region.
+against. By default, the system's entire physical memory is used as the
+region.
monitor_region_end
------------------
@@ -254,7 +255,8 @@ monitor_region_end
End of target memory region in physical address.
The end physical address of memory region that DAMON_LRU_SORT will do work
-against. By default, biggest System RAM is used as the region.
+against. By default, the system's entire physical memory is used as the
+region.
addr_unit
---------
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index d7a0225b49508..ec7e3e32b4ac6 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -67,7 +67,7 @@ Make DAMON_RECLAIM reads the input parameters again, except ``enabled``.
Input parameters that updated while DAMON_RECLAIM is running are not applied
by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values
-of parametrs except ``enabled`` again. Once the re-reading is done, this
+of parameters except ``enabled`` again. Once the re-reading is done, this
parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_RECLAIM will be disabled.
@@ -85,6 +85,17 @@ identifies the region as cold, and reclaims it.
120 seconds by default.
+autotune_monitoring_intervals
+-----------------------------
+
+If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's
+sampling and aggregation intervals. The auto-tuning aims to capture meaningful
+amount of access events in each DAMON-snapshot, while keeping the sampling
+interval 5 milliseconds in minimum, and 10 seconds in maximum. Setting this as
+``N`` disables the auto-tuning.
+
+Disabled by default.
+
quota_ms
--------
@@ -229,7 +240,8 @@ Start of target memory region in physical address.
The start physical address of memory region that DAMON_RECLAIM will do work
against. That is, DAMON_RECLAIM will find cold memory regions in this region
-and reclaims. By default, biggest System RAM is used as the region.
+and reclaims. By default, the system's entire physical memory is used as the
+region.
monitor_region_end
------------------
@@ -238,7 +250,8 @@ End of target memory region in physical address.
The end physical address of memory region that DAMON_RECLAIM will do work
against. That is, DAMON_RECLAIM will find cold memory regions in this region
-and reclaims. By default, biggest System RAM is used as the region.
+and reclaims. By default, the system's entire physical memory is used as the
+region.
addr_unit
---------
diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst
index c4b14daeb2dd6..46c5dd96aa2ed 100644
--- a/Documentation/admin-guide/mm/damon/stat.rst
+++ b/Documentation/admin-guide/mm/damon/stat.rst
@@ -89,3 +89,10 @@ percentiles of the idle time values via this read-only parameter. Reading the
parameter returns 101 idle time values in milliseconds, separated by comma.
Each value represents 0-th, 1st, 2nd, 3rd, ..., 99th and 100th percentile idle
times.
+
+kdamond_pid
+-----------
+
+PID of the DAMON thread.
+
+If DAMON_STAT is enabled, this becomes the PID of the worker thread. Else, -1.
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 534e1199cf091..11c75a598393c 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -66,7 +66,8 @@ comma (",").
│ :ref:`kdamonds <sysfs_kdamonds>`/nr_kdamonds
│ │ :ref:`0 <sysfs_kdamond>`/state,pid,refresh_ms
│ │ │ :ref:`contexts <sysfs_contexts>`/nr_contexts
- │ │ │ │ :ref:`0 <sysfs_context>`/avail_operations,operations,addr_unit
+ │ │ │ │ :ref:`0 <sysfs_context>`/avail_operations,operations,addr_unit,
+ │ │ │ │ pause
│ │ │ │ │ :ref:`monitoring_attrs <sysfs_monitoring_attrs>`/
│ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
│ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us
@@ -83,7 +84,9 @@ comma (",").
│ │ │ │ │ │ │ │ sz/min,max
│ │ │ │ │ │ │ │ nr_accesses/min,max
│ │ │ │ │ │ │ │ age/min,max
- │ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms,effective_bytes,goal_tuner
+ │ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms,
+ │ │ │ │ │ │ │ effective_bytes,goal_tuner,
+ │ │ │ │ │ │ │ fail_charge_num,fail_charge_denom
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
│ │ │ │ │ │ │ │ :ref:`goals <sysfs_schemes_quota_goals>`/nr_goals
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path
@@ -194,9 +197,9 @@ details). At the moment, only one context per kdamond is supported, so only
contexts/<N>/
-------------
-In each context directory, three files (``avail_operations``, ``operations``
-and ``addr_unit``) and three directories (``monitoring_attrs``, ``targets``,
-and ``schemes``) exist.
+In each context directory, four files (``avail_operations``, ``operations``,
+``addr_unit`` and ``pause``) and three directories (``monitoring_attrs``,
+``targets``, and ``schemes``) exist.
DAMON supports multiple types of :ref:`monitoring operations
<damon_design_configurable_operations_set>`, including those for virtual address
@@ -214,6 +217,9 @@ reading from the ``operations`` file.
``addr_unit`` file is for setting and getting the :ref:`address unit
<damon_design_addr_unit>` parameter of the operations set.
+``pause`` file is for setting and getting the :ref:`pause request
+<damon_design_execution_model_and_data_structures>` parameter of the context.
+
.. _sysfs_monitoring_attrs:
contexts/<N>/monitoring_attrs/
@@ -377,9 +383,10 @@ schemes/<N>/quotas/
The directory for the :ref:`quotas <damon_design_damos_quotas>` of the given
DAMON-based operation scheme.
-Under ``quotas`` directory, five files (``ms``, ``bytes``,
-``reset_interval_ms``, ``effective_bytes`` and ``goal_tuner``) and two
-directories (``weights`` and ``goals``) exist.
+Under ``quotas`` directory, seven files (``ms``, ``bytes``,
+``reset_interval_ms``, ``effective_bytes``, ``goal_tuner``, ``fail_charge_num``
+and ``fail_charge_denom``) and two directories (``weights`` and ``goals``)
+exist.
You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and
``reset interval`` in milliseconds by writing the values to the three files,
@@ -398,6 +405,13 @@ the background design of the feature and the name of the selectable algorithms.
Refer to :ref:`goals directory <sysfs_schemes_quota_goals>` for the goals
setup.
+You can set the action-failed memory quota charging ratio by writing the
+numerator and the denominator for the ratio to ``fail_charge_num`` and
+``fail_charge_denom`` files, respectively. Reading those files will return the
+current set values. Refer to :ref:`design
+<damon_design_damos_quotas_failed_memory_charging_ratio>` for more details of
+the ratio feature.
+
The time quota is internally transformed to a size quota. Between the
transformed size quota and user-specified size quota, smaller one is applied.
Based on the user-specified :ref:`goal <sysfs_schemes_quota_goals>`, the
@@ -471,10 +485,10 @@ directory can be used for installing filters regardless of their handled
layers. Filters that requested by ``core_filters`` and ``ops_filters`` will be
installed before those of ``filters``. All three directories have same files.
-Use of ``filters`` directory can make expecting evaluation orders of given
-filters with the files under directory bit confusing. Users are hence
-recommended to use ``core_filters`` and ``ops_filters`` directories. The
-``filters`` directory could be deprecated in future.
+Use of ``filters`` directory can make filters evaluation orders confusing to
+expect. For this reason, ``filters`` directory is deprecated. It is still
+functioning, but is scheduled for removal in the near future. Users should use
+``core_filters`` and ``ops_filters`` directories instead.
In the beginning, the directory has only one file, ``nr_filters``. Writing a
number (``N``) to the file creates the number of child directories named ``0``
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index afc7d52bda2f7..fa7392b5a331d 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -19,6 +19,13 @@ types of monitoring.
To know how user-space can do the configurations and start/stop DAMON, refer to
:ref:`DAMON sysfs interface <sysfs_interface>` documentation.
+Users can also request each context execution to be paused and resumed. When
+it is paused, the kdamond does nothing other than applying online parameter
+update.
+
+To know how user-space can pause/resume each context, refer to :ref:`DAMON
+sysfs context <sysfs_context>` usage documentation.
+
Overall Architecture
====================
@@ -474,6 +481,10 @@ that supports each action are as below.
Supported by ``vaddr`` and ``fvaddr`` operations set. When
TRANSPARENT_HUGEPAGE is disabled, the application of the action will just
fail.
+ - ``collapse``: Call ``madvise()`` for the region with ``MADV_COLLAPSE``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set. When
+ TRANSPARENT_HUGEPAGE is disabled, the application of the action will just
+ fail.
- ``lru_prio``: Prioritize the region on its LRU lists.
Supported by ``paddr`` operations set.
- ``lru_deprio``: Deprioritize the region on its LRU lists.
@@ -565,6 +576,28 @@ interface <sysfs_interface>`, refer to :ref:`weights <sysfs_quotas>` part of
the documentation.
+.. _damon_design_damos_quotas_failed_memory_charging_ratio:
+
+Action-failed Memory Charging Ratio
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+DAMOS action to a given region can fail for some subsets of the memory of the
+region. For example, if the action is ``pageout`` and the region has some
+unreclaimable pages, applying the action to the pages will fail. The amount of
+system resource that is taken for such failed action applications is usually
+different from that for successful action applications. For such cases, users
+can set different charging ratio for such failed memory. The ratio can be
+specified using ``fail_charge_num`` and ``fail_charge_denom`` parameters. The
+two parameters represent the numerator and denominator of the ratio. The
+feature is enabled only if ``fail_charge_denom`` is not zero.
+
+For example, let's suppose a DAMOS action is applied to a region of 1,000 MiB
+size. The action is successfully applied to only 700 MiB of the region.
+``fail_charge_num`` and ``fail_charge_denom`` are set to ``1`` and ``1024``,
+respectively. Then only 700 MiB and 300 KiB of size (``700 MiB + 300 MiB * 1 /
+1024``) will be charged.
+
+
.. _damon_design_damos_quotas_auto_tuning:
Aim-oriented Feedback-driven Auto-tuning
diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst
index bcb9798a27a86..fb2fa00cc9aa1 100644
--- a/Documentation/mm/damon/maintainer-profile.rst
+++ b/Documentation/mm/damon/maintainer-profile.rst
@@ -100,3 +100,24 @@ There is also a public Google `calendar
<https://calendar.google.com/calendar/u/0?cid=ZDIwOTA4YTMxNjc2MDQ3NTIyMmUzYTM5ZmQyM2U4NDA0ZGIwZjBiYmJlZGQxNDM0MmY4ZTRjOTE0NjdhZDRiY0Bncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`_
that has the events. Anyone can subscribe to it. DAMON maintainer will also
provide periodic reminders to the mailing list (damon@lists.linux.dev).
+
+AI Review
+---------
+
+For patches that are publicly posted to DAMON mailing list
+(damon@lists.linux.dev), AI reviews of the patches will be available at
+sashiko.dev. The reviews could also be sent as mails to the author of the
+patch.
+
+Patch authors are encouraged to check the AI reviews and share their opinions.
+The sharing could be done as a reply to the mail thread. Consider reducing the
+recipients list for such sharing, since some people are not really interested
+in AI reviews. As a rule of thumb, drop stable@vger.kernel.org and individuals
+except DAMON maintainer.
+
+`hkml` also provides a `feature
+<https://github.com/sjp38/hackermail/blob/master/USAGE.md#forwarding-sashikodev-statuscomments-to-mailing-list>`_
+for such sharing. Please feel free to use the feature.
+
+It is only an optional recommendation. DAMON maintainer could also ask any
+question about the AI reviews, though.
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8242f93f05e4f..bfea81307126a 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -2025,12 +2025,13 @@ err:
return ret;
}
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- __remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap, pgmap);
__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 031b39eb081c5..687980b6e91f0 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -119,12 +119,13 @@ int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
return ret;
}
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- __remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap, pgmap);
}
#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 648d0c5602ec8..4c1afab919963 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -158,12 +158,13 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- __remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap, pgmap);
arch_remove_linear_mapping(start, size);
}
#endif
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index fa8d2f6f554b5..885f1db4e9bfd 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1742,9 +1742,10 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *param
return ret;
}
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+ __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap, pgmap);
remove_linear_mapping(start, size);
flush_tlb_all();
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 1f72efc2a579f..11a689423440f 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -276,12 +276,13 @@ int arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- __remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap, pgmap);
vmem_remove_mapping(start, size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 83fb34b39ca7c..8bc9ce541c141 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -248,7 +248,7 @@ static void sh4_flush_cache_page(void *args)
*/
map_coherent = (current_cpu_data.dcache.n_aliases &&
test_bit(PG_dcache_clean, folio_flags(folio, 0)) &&
- page_mapped(page));
+ folio_mapped(folio));
if (map_coherent)
vaddr = kmap_coherent(page, address);
else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df2261fa4f985..77b889b71cf38 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1288,12 +1288,13 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- __remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap, pgmap);
kernel_physical_mapping_remove(start, start + size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 2cc8749bc8711..a18e2b968e4da 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -227,6 +227,12 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
if (rc)
continue;
+ /* range was never added during probe */
+ if (!data->res[i]) {
+ success++;
+ continue;
+ }
+
rc = remove_memory(range.start, range_len(&range));
if (rc == 0) {
remove_resource(data->res[i]);
diff --git a/drivers/gpu/drm/drm_managed.c b/drivers/gpu/drm/drm_managed.c
index 247f468731de0..a9da94319b052 100644
--- a/drivers/gpu/drm/drm_managed.c
+++ b/drivers/gpu/drm/drm_managed.c
@@ -232,8 +232,8 @@ void *drmm_kmalloc(struct drm_device *dev, size_t size, gfp_t gfp)
dr = alloc_dr(NULL, size, gfp, dev_to_node(dev->dev));
if (!dr) {
- drm_dbg_drmres(dev, "failed to allocate %zu bytes, %u flags\n",
- size, gfp);
+ drm_dbg_drmres(dev, "failed to allocate %zu bytes, %pGg\n",
+ size, &gfp);
return NULL;
}
dr->node.name = kstrdup_const("kmalloc", gfp);
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 57137d3ac1592..051aa1331051c 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_MMU
-#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO)
+#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN)
#define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
/**
diff --git a/include/linux/damon.h b/include/linux/damon.h
index f2cdb7c3f5e6c..c7a31572689be 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -121,6 +121,7 @@ struct damon_target {
* @DAMOS_PAGEOUT: Reclaim the region.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_COLLAPSE: Call ``madvise()`` for the region with MADV_COLLAPSE.
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
* @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists.
* @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions.
@@ -140,6 +141,7 @@ enum damos_action {
DAMOS_PAGEOUT,
DAMOS_HUGEPAGE,
DAMOS_NOHUGEPAGE,
+ DAMOS_COLLAPSE,
DAMOS_LRU_PRIO,
DAMOS_LRU_DEPRIO,
DAMOS_MIGRATE_HOT,
@@ -159,6 +161,8 @@ enum damos_action {
* @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
* @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio.
* @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio.
+ * @DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: Scheme-eligible memory ratio of a
+ * node in basis points (0-10000).
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -172,6 +176,7 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
DAMOS_QUOTA_ACTIVE_MEM_BP,
DAMOS_QUOTA_INACTIVE_MEM_BP,
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@@ -233,6 +238,8 @@ enum damos_quota_goal_tuner {
* @goals: Head of quota tuning goals (&damos_quota_goal) list.
* @goal_tuner: Goal-based @esz tuning algorithm to use.
* @esz: Effective size quota in bytes.
+ * @fail_charge_num: Failed regions charge rate numerator.
+ * @fail_charge_denom: Failed regions charge rate denominator.
*
* @weight_sz: Weight of the region's size for prioritization.
* @weight_nr_accesses: Weight of the region's nr_accesses for prioritization.
@@ -262,6 +269,10 @@ enum damos_quota_goal_tuner {
*
* The resulting effective size quota in bytes is set to @esz.
*
+ * For DAMOS action applying failed amount of regions, charging those same to
+ * those that the action has successfully applied may be unfair. For the
+ * reason, 'the size * @fail_charge_num / @fail_charge_denom' is charged.
+ *
* For selecting regions within the quota, DAMON prioritizes current scheme's
* target memory regions using the &struct damon_operations->get_scheme_score.
* You could customize the prioritization logic by setting &weight_sz,
@@ -276,6 +287,9 @@ struct damos_quota {
enum damos_quota_goal_tuner goal_tuner;
unsigned long esz;
+ unsigned int fail_charge_num;
+ unsigned int fail_charge_denom;
+
unsigned int weight_sz;
unsigned int weight_nr_accesses;
unsigned int weight_age;
@@ -787,6 +801,7 @@ struct damon_attrs {
* @ops: Set of monitoring operations for given use cases.
* @addr_unit: Scale factor for core to ops address conversion.
* @min_region_sz: Minimum region size.
+ * @pause: Pause kdamond main loop.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -840,6 +855,7 @@ struct damon_ctx {
struct damon_operations ops;
unsigned long addr_unit;
unsigned long min_region_sz;
+ bool pause;
struct list_head adaptive_targets;
struct list_head schemes;
@@ -994,7 +1010,7 @@ int damon_kdamond_pid(struct damon_ctx *ctx);
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
-int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+int damon_set_region_system_rams_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
unsigned long addr_unit,
unsigned long min_region_sz);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 51ef13ed756eb..cdf95a9f0b87c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -239,6 +239,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array);
#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
+void free_pages_bulk(struct page **page_array, unsigned long nr_pages);
+
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages,
struct page **page_array);
@@ -467,6 +469,8 @@ void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages);
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
#endif
+void __free_contig_range(unsigned long pfn, unsigned long nr_pages);
+
DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index cd4972a7c97ca..54ca0c88bab6e 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -281,9 +281,9 @@ enum {
*
* %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
* Used for userspace and vmalloc pages; the latter are unpoisoned by
- * kasan_unpoison_vmalloc instead. For userspace pages, results in
- * poisoning being skipped as well, see should_skip_kasan_poison for
- * details. Only effective in HW_TAGS mode.
+ * kasan_unpoison_vmalloc instead. If passed to vmalloc, kasan_unpoison_vmalloc
+ * is skipped too. For userspace pages, results in poisoning being skipped as
+ * well, see should_skip_kasan_poison for details. Only effective in HW_TAGS mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 0574c21ca45da..bb71e7dba4f7f 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -262,7 +262,7 @@ static inline bool is_kmap_addr(const void *x)
* @__addr: Virtual address to be unmapped
*
* Unmaps an address previously mapped by kmap_atomic() and re-enables
- * pagefaults. Depending on PREEMP_RT configuration, re-enables also
+ * pagefaults. Depending on PREEMPT_RT configuration, re-enables also
* migration and preemption. Users should not count on these side effects.
*
* Mappings should be unmapped in the reverse order that they were mapped.
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 815e908c4135b..7c9d66729c609 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -135,9 +135,10 @@ static inline bool movable_node_is_enabled(void)
return movable_node_enabled;
}
-extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
+extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap);
extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap);
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
@@ -307,7 +308,8 @@ extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap);
+ struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap);
extern struct zone *zone_for_pfn_range(enum mmop online_type,
int nid, struct memory_group *group, unsigned long start_pfn,
unsigned long nr_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fc2acedf0b763..11f440e9d7cde 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1888,16 +1888,6 @@ static inline bool folio_mapped(const struct folio *folio)
return folio_mapcount(folio) >= 1;
}
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any sub-page of compound page is mapped,
- * even if this particular sub-page is not itself mapped by any PTE or PMD.
- */
-static inline bool page_mapped(const struct page *page)
-{
- return folio_mapped(page_folio(page));
-}
-
static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
@@ -4855,18 +4845,10 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
-void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
-pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
-p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
-pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
-pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
-pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap, unsigned long ptpfn,
- unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31a848485ad9d..1f50991b43e3b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1350,6 +1350,7 @@ struct readahead_control {
struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
pgoff_t _index;
+ pgoff_t _max_index; /* limit readahead to _max_index, inclusive */
unsigned int _nr_pages;
unsigned int _batch_count;
bool dropbehind;
@@ -1363,6 +1364,7 @@ struct readahead_control {
.mapping = m, \
.ra = r, \
._index = i, \
+ ._max_index = ULONG_MAX, \
}
#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 051e429026904..307b8390fc670 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -92,7 +92,7 @@ static inline long set_restart_fn(struct restart_block *restart,
#define THREAD_ALIGN THREAD_SIZE
#endif
-#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
+#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_SKIP_KASAN)
/*
* flag set/clear/test wrappers
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6a2f51ebbfd35..faecd55224017 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -30,8 +30,8 @@ struct vmpressure {
struct mem_cgroup;
#ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
- unsigned long scanned, unsigned long reclaimed);
+void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
+ unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
extern void vmpressure_init(struct vmpressure *vmpr);
@@ -44,8 +44,9 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
- unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg,
+ bool tree, unsigned long scanned,
+ unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {}
#endif /* CONFIG_MEMCG */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 4445a8d9218de..b4bf7b8def1f5 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -96,6 +96,58 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
__entry->order)
);
+TRACE_EVENT(mm_vmscan_balance_pgdat_begin,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx),
+
+ TP_ARGS(nid, order, highest_zoneidx),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE))
+);
+
+TRACE_EVENT(mm_vmscan_balance_pgdat_end,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx,
+ unsigned long nr_reclaimed),
+
+ TP_ARGS(nid, order, highest_zoneidx, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s nr_reclaimed=%lu",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE),
+ __entry->nr_reclaimed)
+);
+
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 49a8f7b1beef5..a497c5913bd49 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -729,7 +729,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
page = llist_entry(pos, struct page, pcp_llist);
- if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ if (page_cnt == 1 && page_ref_count(page) > 1) /* maybe mapped by user space */
/* Optimization for the common case of page_cnt==1:
* If page wasn't mapped into some user vma there
* is no need to call zap_pages which is slow. When
diff --git a/kernel/fork.c b/kernel/fork.c
index 8ac38beae360b..ec6a120291e5d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -204,7 +204,7 @@ static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
* accounting is performed by the code assigning/releasing stacks to tasks.
* We need a zeroed memory without __GFP_ACCOUNT.
*/
-#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN)
struct vm_stack {
struct rcu_head rcu;
@@ -342,7 +342,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
}
/* Reset stack metadata. */
- kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
+ if (!kasan_hw_tags_enabled())
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
stack = kasan_reset_tag(vm_area->addr);
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad90..e221fa1dc54d0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -810,7 +810,6 @@ if TRANSPARENT_HUGEPAGE
choice
prompt "Transparent Hugepage Support sysfs defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_ALWAYS
help
Selects the sysfs defaults for Transparent Hugepage Support.
@@ -840,7 +839,6 @@ endchoice
choice
prompt "Shmem hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -886,7 +884,6 @@ endchoice
choice
prompt "Tmpfs hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -931,7 +928,7 @@ endchoice
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
+ depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3dbbbfdeff719..9f38deddcb30e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -13,10 +13,14 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/psi.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_choices.h>
+/* for damon_get_folio() used by node eligible memory metrics */
+#include "ops-common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -918,6 +922,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src)
if (err)
return err;
dst->goal_tuner = src->goal_tuner;
+ dst->fail_charge_num = src->fail_charge_num;
+ dst->fail_charge_denom = src->fail_charge_denom;
dst->weight_sz = src->weight_sz;
dst->weight_nr_accesses = src->weight_nr_accesses;
dst->weight_age = src->weight_age;
@@ -1326,11 +1332,26 @@ static int damon_commit_targets(
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
int err;
+ struct damos *scheme;
+ struct damos_quota_goal *goal;
dst->maybe_corrupted = true;
if (!is_power_of_2(src->min_region_sz))
return -EINVAL;
+ /* node_eligible_mem_bp metric requires PADDR ops */
+ if (src->ops.id != DAMON_OPS_PADDR) {
+ damon_for_each_scheme(scheme, src) {
+ struct damos_quota *quota = &scheme->quota;
+
+ damos_for_each_quota_goal(goal, quota) {
+ if (goal->metric ==
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP)
+ return -EINVAL;
+ }
+ }
+ }
+
err = damon_commit_schemes(dst, src);
if (err)
return err;
@@ -1349,6 +1370,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
if (err)
return err;
}
+ dst->pause = src->pause;
dst->ops = src->ops;
dst->addr_unit = src->addr_unit;
dst->min_region_sz = src->min_region_sz;
@@ -2046,6 +2068,37 @@ static void damos_walk_cancel(struct damon_ctx *ctx)
mutex_unlock(&ctx->walk_control_lock);
}
+static void damos_charge_quota(struct damos_quota *quota,
+ unsigned long sz_region, unsigned long sz_applied)
+{
+ /*
+ * sz_applied could be bigger than sz_region, depending on ops
+ * implementation of the action, e.g., damos_pa_pageout(). Charge only
+ * the region size in the case.
+ */
+ if (!quota->fail_charge_denom || sz_applied > sz_region)
+ quota->charged_sz += sz_region;
+ else
+ quota->charged_sz += sz_applied + mult_frac(
+ (sz_region - sz_applied),
+ quota->fail_charge_num,
+ quota->fail_charge_denom);
+}
+
+static bool damos_quota_is_full(struct damos_quota *quota,
+ unsigned long min_region_sz)
+{
+ if (!damos_quota_is_set(quota))
+ return false;
+ if (quota->charged_sz >= quota->esz)
+ return true;
+ /*
+ * DAMOS action is applied per region, so <min_region_sz remaining
+ * quota means the quota is effectively full.
+ */
+ return quota->esz - quota->charged_sz < min_region_sz;
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -2102,11 +2155,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
- quota->charged_sz += sz;
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz) {
+ damos_charge_quota(quota, sz, sz_applied);
+ if (damos_quota_is_full(quota, c->min_region_sz)) {
quota->charge_target_from = t;
- quota->charge_addr_from = r->ar.end + 1;
+ quota->charge_addr_from = r->ar.end;
}
}
if (s->action != DAMOS_STAT)
@@ -2132,8 +2184,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
continue;
/* Check the quota */
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
continue;
if (damos_skip_charged_region(t, r, s, c->min_region_sz))
@@ -2152,6 +2203,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
}
/*
+ * damos_apply_target() - Apply DAMOS schemes to a given target.
+ * @c: monitoring context to apply its DAMOS schemes to..
+ * @t: monitoring target to apply the schemes to.
+ * @max_region_sz: maximum region size for @c.
+ *
+ * This function could split regions for keeping the quota. To minimize
+ * overhead from the split operations increased number of regions, this
+ * function will also merge regions after the schemes applying attempt is done,
+ * for each region. The merge operation is made only when it doesn't lose the
+ * monitoring information and not violating @max_region_sz.
+ *
+ * Hence, after this function is called, the total number of regions could
+ * be increased or reduced. The increase could make max_nr_regions temporarily
+ * be violated, until the next per-aggregation interval regions merge operation
+ * is executed. The decrease will not violate min_nr_regions though, since it
+ * keeps @max_region_sz.
+ */
+static void damos_apply_target(struct damon_ctx *c, struct damon_target *t,
+ unsigned long max_region_sz)
+{
+ struct damon_region *r;
+
+ damon_for_each_region(r, t) {
+ struct damon_region *prev_r;
+
+ damon_do_apply_schemes(c, t, r);
+ /*
+ * damon_do_apply_scheems() could split the region for the
+ * quota. Keeping the new slices is an overhead. Merge back
+ * the slices into the previous region if it doesn't lose any
+ * information and not violating the max_region_sz.
+ */
+ if (damon_first_region(t) == r)
+ continue;
+ prev_r = damon_prev_region(r);
+ if (prev_r->ar.end != r->ar.start)
+ continue;
+ if (prev_r->age != r->age)
+ continue;
+ if (prev_r->last_nr_accesses != r->last_nr_accesses)
+ continue;
+ if (prev_r->nr_accesses != r->nr_accesses)
+ continue;
+ if (r->ar.end - prev_r->ar.start > max_region_sz)
+ continue;
+ prev_r->ar.end = r->ar.end;
+ damon_destroy_region(r, t);
+ r = prev_r;
+ }
+}
+
+/*
* damon_feed_loop_next_input() - get next input to achieve a target score.
* @last_input The last input.
* @score Current score that made with @last_input.
@@ -2287,7 +2390,115 @@ static unsigned long damos_get_node_memcg_used_bp(
numerator = i.totalram - used_pages;
return mult_frac(numerator, 10000, i.totalram);
}
-#else
+
+#ifdef CONFIG_DAMON_PADDR
+/*
+ * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node.
+ * @c: The DAMON context.
+ * @s: The scheme.
+ * @nid: The target NUMA node id.
+ * @total: Output for total eligible bytes across all nodes.
+ *
+ * Iterates through each folio in eligible regions to accurately determine
+ * which node the memory resides on. Returns eligible bytes on the specified
+ * node and sets *total to the sum across all nodes.
+ *
+ * Note: This function requires damon_get_folio() from ops-common.c, which is
+ * only available when CONFIG_DAMON_PADDR is enabled. It also requires the
+ * context to be using PADDR operations for meaningful results.
+ */
+static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c,
+ struct damos *s, int nid, phys_addr_t *total)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ phys_addr_t total_eligible = 0;
+ phys_addr_t node_eligible = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ phys_addr_t addr, end_addr;
+
+ if (!__damos_valid_target(r, s))
+ continue;
+
+ /* Convert from core address units to physical bytes */
+ addr = (phys_addr_t)r->ar.start * c->addr_unit;
+ end_addr = (phys_addr_t)r->ar.end * c->addr_unit;
+ while (addr < end_addr) {
+ struct folio *folio;
+ phys_addr_t folio_start, folio_end;
+ phys_addr_t overlap_start, overlap_end;
+ phys_addr_t counted;
+
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (!folio) {
+ addr = PAGE_ALIGN_DOWN(addr +
+ PAGE_SIZE);
+ if (!addr)
+ break;
+ continue;
+ }
+
+ /*
+ * Calculate exact overlap between the region
+ * [addr, end_addr) and the folio range.
+ * The folio may start before addr if addr is
+ * in the middle of a large folio.
+ */
+ folio_start = PFN_PHYS(folio_pfn(folio));
+ folio_end = folio_start + folio_size(folio);
+
+ overlap_start = max(addr, folio_start);
+ overlap_end = min(end_addr, folio_end);
+
+ if (overlap_end > overlap_start) {
+ counted = overlap_end - overlap_start;
+ total_eligible += counted;
+ if (folio_nid(folio) == nid)
+ node_eligible += counted;
+ }
+
+ /* Advance past the entire folio */
+ addr = folio_end;
+ folio_put(folio);
+ }
+ cond_resched();
+ }
+ }
+
+ *total = total_eligible;
+ return node_eligible;
+}
+
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ phys_addr_t total_eligible = 0;
+ phys_addr_t node_eligible;
+
+ if (c->ops.id != DAMON_OPS_PADDR)
+ return 0;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid))
+ return 0;
+
+ node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible);
+
+ if (!(unsigned long)total_eligible)
+ return 0;
+
+ return mult_frac((unsigned long)node_eligible, 10000,
+ (unsigned long)total_eligible);
+}
+#else /* CONFIG_DAMON_PADDR */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_DAMON_PADDR */
+#else /* CONFIG_NUMA */
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
@@ -2299,7 +2510,13 @@ static unsigned long damos_get_node_memcg_used_bp(
{
return 0;
}
-#endif
+
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+ struct damos *s, int nid)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
/*
* Returns LRU-active or inactive memory to total LRU memory size ratio.
@@ -2319,7 +2536,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio)
return mult_frac(inactive, 10000, total);
}
-static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
+static void damos_set_quota_goal_current_value(struct damon_ctx *c,
+ struct damos *s, struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -2345,19 +2563,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = damos_get_in_active_mem_bp(
goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP);
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->current_value = damos_get_node_eligible_mem_bp(c, s,
+ goal->nid);
+ break;
default:
break;
}
}
/* Return the highest score since it makes schemes least aggressive */
-static unsigned long damos_quota_score(struct damos_quota *quota)
+static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s)
{
struct damos_quota_goal *goal;
+ struct damos_quota *quota = &s->quota;
unsigned long highest_score = 0;
damos_for_each_quota_goal(goal, quota) {
- damos_set_quota_goal_current_value(goal);
+ damos_set_quota_goal_current_value(c, s, goal);
highest_score = max(highest_score,
mult_frac(goal->current_value, 10000,
goal->target_value));
@@ -2366,17 +2589,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
return highest_score;
}
-static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
quota->esz_bp = damon_feed_loop_next_input(
max(quota->esz_bp, 10000UL), score);
}
-static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c,
+ struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
if (score >= 10000)
quota->esz_bp = 0;
@@ -2389,9 +2615,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
/*
* Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
*/
-static void damos_set_effective_quota(struct damos_quota *quota,
- struct damon_ctx *ctx)
+static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
unsigned long throughput;
unsigned long esz = ULONG_MAX;
@@ -2402,9 +2628,9 @@ static void damos_set_effective_quota(struct damos_quota *quota,
if (!list_empty(&quota->goals)) {
if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST)
- damos_goal_tune_esz_bp_consist(quota);
+ damos_goal_tune_esz_bp_consist(ctx, s);
else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL)
- damos_goal_tune_esz_bp_temporal(quota);
+ damos_goal_tune_esz_bp_temporal(ctx, s);
esz = quota->esz_bp / 10000;
}
@@ -2452,22 +2678,21 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* First charge window */
if (!quota->total_charged_sz && !quota->charged_from) {
quota->charged_from = jiffies;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
}
/* New charge window starts */
if (!time_in_range_open(jiffies, quota->charged_from,
quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
s->stat.qt_exceeds++;
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
if (trace_damos_esz_enabled())
cached_esz = quota->esz;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
if (trace_damos_esz_enabled() && quota->esz != cached_esz)
damos_trace_esz(c, s, quota);
}
@@ -2521,9 +2746,9 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s)
static void kdamond_apply_schemes(struct damon_ctx *c)
{
struct damon_target *t;
- struct damon_region *r;
struct damos *s;
bool has_schemes_to_apply = false;
+ unsigned long max_region_sz;
damon_for_each_scheme(s, c) {
if (time_before(c->passed_sample_intervals, s->next_apply_sis))
@@ -2540,13 +2765,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ max_region_sz = damon_region_sz_limit(c);
mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
if (c->ops.target_valid && c->ops.target_valid(t) == false)
continue;
-
- damon_for_each_region(r, t)
- damon_do_apply_schemes(c, t, r);
+ damos_apply_target(c, t, max_region_sz);
}
damon_for_each_scheme(s, c) {
@@ -3014,6 +3238,14 @@ static int kdamond_fn(void *data)
kdamond_call(ctx, false);
if (ctx->maybe_corrupted)
break;
+ while (ctx->pause) {
+ damos_walk_cancel(ctx);
+ kdamond_usleep(ctx->attrs.sample_interval);
+ /* allow caller unset pause via damon_call() */
+ kdamond_call(ctx, false);
+ if (kdamond_need_stop(ctx) || ctx->maybe_corrupted)
+ goto done;
+ }
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
else
@@ -3096,14 +3328,20 @@ done:
return 0;
}
-static int walk_system_ram(struct resource *res, void *arg)
+struct damon_system_ram_range_walk_arg {
+ bool walked;
+ struct resource res;
+};
+
+static int damon_system_ram_walk_fn(struct resource *res, void *arg)
{
- struct resource *a = arg;
+ struct damon_system_ram_range_walk_arg *a = arg;
- if (resource_size(a) < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
+ if (!a->walked) {
+ a->walked = true;
+ a->res.start = res->start;
}
+ a->res.end = res->end;
return 0;
}
@@ -3120,27 +3358,24 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra,
return ra / addr_unit;
}
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool damon_find_biggest_system_ram(unsigned long *start,
+static bool damon_find_system_rams_range(unsigned long *start,
unsigned long *end, unsigned long addr_unit)
-
{
- struct resource res = {};
+ struct damon_system_ram_range_walk_arg arg = {};
- walk_system_ram_res(0, -1, &res, walk_system_ram);
- *start = damon_res_to_core_addr(res.start, addr_unit);
- *end = damon_res_to_core_addr(res.end + 1, addr_unit);
+ walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn);
+ if (!arg.walked)
+ return false;
+ *start = damon_res_to_core_addr(arg.res.start, addr_unit);
+ *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit);
if (*end <= *start)
return false;
return true;
}
/**
- * damon_set_region_biggest_system_ram_default() - Set the region of the given
- * monitoring target as requested, or biggest 'System RAM'.
+ * damon_set_region_system_rams_default() - Set the region of the given
+ * monitoring target as requested, or to cover all 'System RAM' resources.
* @t: The monitoring target to set the region.
* @start: The pointer to the start address of the region.
* @end: The pointer to the end address of the region.
@@ -3148,14 +3383,14 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* @min_region_sz: Minimum region size.
*
* This function sets the region of @t as requested by @start and @end. If the
- * values of @start and @end are zero, however, this function finds the biggest
- * 'System RAM' resource and sets the region to cover the resource. In the
- * latter case, this function saves the start and end addresses of the resource
- * in @start and @end, respectively.
+ * values of @start and @end are zero, however, this function finds 'System
+ * RAM' resources and sets the region to cover all the resource. In the latter
+ * case, this function saves the start and the end addresseses of the first and
+ * the last resources in @start and @end, respectively.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+int damon_set_region_system_rams_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
unsigned long addr_unit, unsigned long min_region_sz)
{
@@ -3165,7 +3400,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
return -EINVAL;
if (!*start && !*end &&
- !damon_find_biggest_system_ram(start, end, addr_unit))
+ !damon_find_system_rams_range(start, end, addr_unit))
return -EINVAL;
addr_range.start = *start;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 8cfe7bd3dc1d3..08500a15bbfb8 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* the re-reading, DAMON_LRU_SORT will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Desired active to [in]active memory ratio in bp (1/10,000).
@@ -140,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -149,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -327,7 +328,7 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
goto out;
- err = damon_set_region_biggest_system_ram_default(param_target,
+ err = damon_set_region_system_rams_default(param_target,
&monitor_region_start,
&monitor_region_end,
param_ctx->addr_unit,
@@ -340,18 +341,51 @@ out:
return err;
}
-static int damon_lru_sort_handle_commit_inputs(void)
+static int damon_lru_sort_commit_inputs_fn(void *arg)
{
+ return damon_lru_sort_apply_parameters();
+}
+
+static int damon_lru_sort_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_lru_sort_commit_inputs_fn,
+ };
+
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
- if (!commit_inputs)
+ if (!commit_inputs_request)
return 0;
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_lru_sort_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_lru_sort_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -365,7 +399,7 @@ static int damon_lru_sort_damon_call_fn(void *arg)
damon_lru_sort_cold_stat = s->stat;
}
- return damon_lru_sort_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index c3e4c871b0bb2..5c93ef2bb8a97 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
damon_max_nr_accesses(&c->attrs);
age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
+ if (age_in_sec)
+ age_in_log = min_t(int, ilog2(age_in_sec) + 1,
+ DAMON_MAX_AGE_IN_LOG);
+ else
+ age_in_log = 0;
+
/* If frequency is 0, higher age means it's colder */
if (freq_subscore == 0)
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 96f6dfc28eae4..1f0319bfa0788 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* re-reading, DAMON_RECLAIM will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Time threshold for cold memory regions identification in microseconds.
@@ -92,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600);
static unsigned long quota_autotune_feedback __read_mostly;
module_param(quota_autotune_feedback, ulong, 0600);
+/*
+ * Auto-tune monitoring intervals.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's
+ * sampling and aggregation intervals. The auto-tuning aims to capture
+ * meaningful amount of access events in each DAMON-snapshot, while keeping the
+ * sampling intervals 5 milliseconds in minimum, and 10 seconds in maximum.
+ * Setting this as ``N`` disables the auto-tuning.
+ *
+ * Disabled by default.
+ */
+static bool autotune_monitoring_intervals __read_mostly;
+module_param(autotune_monitoring_intervals, bool, 0600);
+
static struct damos_watermarks damon_reclaim_wmarks = {
.metric = DAMOS_WMARK_FREE_MEM_RATE,
.interval = 5000000, /* 5 seconds */
@@ -114,7 +127,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -123,7 +137,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -151,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
static struct damon_ctx *ctx;
static struct damon_target *target;
-static struct damos *damon_reclaim_new_scheme(void)
+static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval)
{
struct damos_access_pattern pattern = {
/* Find regions having PAGE_SIZE or larger size */
@@ -161,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void)
.min_nr_accesses = 0,
.max_nr_accesses = 0,
/* for min_age or more micro-seconds */
- .min_age_region = min_age /
- damon_reclaim_mon_attrs.aggr_interval,
+ .min_age_region = min_age / aggr_interval,
.max_age_region = UINT_MAX,
};
@@ -183,6 +197,7 @@ static int damon_reclaim_apply_parameters(void)
{
struct damon_ctx *param_ctx;
struct damon_target *param_target;
+ struct damon_attrs attrs;
struct damos *scheme;
struct damos_quota_goal *goal;
struct damos_filter *filter;
@@ -200,12 +215,21 @@ static int damon_reclaim_apply_parameters(void)
goto out;
}
- err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
+ attrs = damon_reclaim_mon_attrs;
+ if (autotune_monitoring_intervals) {
+ attrs.sample_interval = 5000;
+ attrs.aggr_interval = 100000;
+ attrs.intervals_goal.access_bp = 40;
+ attrs.intervals_goal.aggrs = 3;
+ attrs.intervals_goal.min_sample_us = 5000;
+ attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000;
+ }
+ err = damon_set_attrs(param_ctx, &attrs);
if (err)
goto out;
err = -ENOMEM;
- scheme = damon_reclaim_new_scheme();
+ scheme = damon_reclaim_new_scheme(attrs.aggr_interval);
if (!scheme)
goto out;
damon_set_schemes(param_ctx, &scheme, 1);
@@ -233,11 +257,9 @@ static int damon_reclaim_apply_parameters(void)
damos_add_filter(scheme, filter);
}
- err = damon_set_region_biggest_system_ram_default(param_target,
- &monitor_region_start,
- &monitor_region_end,
- param_ctx->addr_unit,
- param_ctx->min_region_sz);
+ err = damon_set_region_system_rams_default(param_target,
+ &monitor_region_start, &monitor_region_end,
+ param_ctx->addr_unit, param_ctx->min_region_sz);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
@@ -246,18 +268,51 @@ out:
return err;
}
-static int damon_reclaim_handle_commit_inputs(void)
+static int damon_reclaim_commit_inputs_fn(void *arg)
{
+ return damon_reclaim_apply_parameters();
+}
+
+static int damon_reclaim_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_reclaim_commit_inputs_fn,
+ };
+
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
- if (!commit_inputs)
+ if (!commit_inputs_request)
return 0;
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_reclaim_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_reclaim_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -267,7 +322,7 @@ static int damon_reclaim_damon_call_fn(void *arg)
damon_for_each_scheme(s, c)
damon_reclaim_stat = s->stat;
- return damon_reclaim_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 3951b762cbddf..0e14f5bb8f75d 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data)
return 0;
}
-struct damon_stat_system_ram_range_walk_arg {
- bool walked;
- struct resource res;
-};
-
-static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg)
-{
- struct damon_stat_system_ram_range_walk_arg *a = arg;
-
- if (!a->walked) {
- a->walked = true;
- a->res.start = res->start;
- }
- a->res.end = res->end;
- return 0;
-}
-
-static unsigned long damon_stat_res_to_core_addr(resource_size_t ra,
- unsigned long addr_unit)
-{
- /*
- * Use div_u64() for avoiding linking errors related with __udivdi3,
- * __aeabi_uldivmod, or similar problems. This should also improve the
- * performance optimization (read div_u64() comment for the detail).
- */
- if (sizeof(ra) == 8 && sizeof(addr_unit) == 4)
- return div_u64(ra, addr_unit);
- return ra / addr_unit;
-}
-
-static int damon_stat_set_monitoring_region(struct damon_target *t,
- unsigned long addr_unit, unsigned long min_region_sz)
-{
- struct damon_addr_range addr_range;
- struct damon_stat_system_ram_range_walk_arg arg = {};
-
- walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn);
- if (!arg.walked)
- return -EINVAL;
- addr_range.start = damon_stat_res_to_core_addr(
- arg.res.start, addr_unit);
- addr_range.end = damon_stat_res_to_core_addr(
- arg.res.end + 1, addr_unit);
- if (addr_range.end <= addr_range.start)
- return -EINVAL;
- return damon_set_regions(t, &addr_range, 1, min_region_sz);
-}
-
static struct damon_ctx *damon_stat_build_ctx(void)
{
struct damon_ctx *ctx;
struct damon_attrs attrs;
struct damon_target *target;
+ unsigned long start = 0, end = 0;
ctx = damon_new_ctx();
if (!ctx)
@@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (!target)
goto free_out;
damon_add_target(ctx, target);
- if (damon_stat_set_monitoring_region(target, ctx->addr_unit,
- ctx->min_region_sz))
+ if (damon_set_region_system_rams_default(target, &start, &end,
+ ctx->addr_unit, ctx->min_region_sz))
goto free_out;
return ctx;
free_out:
@@ -313,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp)
return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N');
}
+static int damon_stat_kdamond_pid_store(
+ const char *val, const struct kernel_param *kp)
+{
+ /*
+ * kdamond_pid is read-only, but kernel command line could write it.
+ * Do nothing here.
+ */
+ return 0;
+}
+
+static int damon_stat_kdamond_pid_load(
+ char *buffer, const struct kernel_param *kp)
+{
+ int pid;
+
+ if (!damon_stat_context) {
+ pid = -1;
+ } else {
+ pid = damon_kdamond_pid(damon_stat_context);
+ if (pid < 1)
+ pid = -1;
+ }
+ return sprintf(buffer, "%d\n", pid);
+}
+
+static const struct kernel_param_ops kdamond_pid_param_ops = {
+ .set = damon_stat_kdamond_pid_store,
+ .get = damon_stat_kdamond_pid_load,
+};
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_STAT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400);
+MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond");
+
static int __init damon_stat_init(void)
{
int err = 0;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index a8014780edae9..ab2153fff9a83 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1093,6 +1093,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_INACTIVE_MEM_BP,
.name = "inactive_mem_bp",
},
+ {
+ .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
+ .name = "node_eligible_mem_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -1508,6 +1512,8 @@ struct damon_sysfs_quotas {
unsigned long reset_interval_ms;
unsigned long effective_sz; /* Effective size quota in bytes */
enum damos_quota_goal_tuner goal_tuner;
+ unsigned int fail_charge_num;
+ unsigned int fail_charge_denom;
};
static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
@@ -1682,6 +1688,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t fail_charge_num_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_num);
+}
+
+static ssize_t fail_charge_num_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_num);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t fail_charge_denom_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom);
+}
+
+static ssize_t fail_charge_denom_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_denom);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
static void damon_sysfs_quotas_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
@@ -1702,12 +1750,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr =
static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr =
__ATTR_RW_MODE(goal_tuner, 0600);
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr =
+ __ATTR_RW_MODE(fail_charge_num, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr =
+ __ATTR_RW_MODE(fail_charge_denom, 0600);
+
static struct attribute *damon_sysfs_quotas_attrs[] = {
&damon_sysfs_quotas_ms_attr.attr,
&damon_sysfs_quotas_sz_attr.attr,
&damon_sysfs_quotas_reset_interval_ms_attr.attr,
&damon_sysfs_quotas_effective_bytes_attr.attr,
&damon_sysfs_quotas_goal_tuner_attr.attr,
+ &damon_sysfs_quotas_fail_charge_num_attr.attr,
+ &damon_sysfs_quotas_fail_charge_denom_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_quotas);
@@ -2061,6 +2117,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
.name = "nohugepage",
},
{
+ .action = DAMOS_COLLAPSE,
+ .name = "collapse",
+ },
+ {
.action = DAMOS_LRU_PRIO,
.name = "lru_prio",
},
@@ -2685,6 +2745,9 @@ static int damos_sysfs_add_quota_score(
}
goal->nid = sysfs_goal->nid;
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
default:
break;
}
@@ -2796,6 +2859,8 @@ static struct damos *damon_sysfs_mk_scheme(
.weight_nr_accesses = sysfs_weights->nr_accesses,
.weight_age = sysfs_weights->age,
.goal_tuner = sysfs_quotas->goal_tuner,
+ .fail_charge_num = sysfs_quotas->fail_charge_num,
+ .fail_charge_denom = sysfs_quotas->fail_charge_denom,
};
struct damos_watermarks wmarks = {
.metric = sysfs_wmarks->metric,
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index eefa959aa30ae..d5863cc33d230 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -866,6 +866,7 @@ struct damon_sysfs_context {
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
+ bool pause;
};
static struct damon_sysfs_context *damon_sysfs_context_alloc(
@@ -878,6 +879,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
context->kobj = (struct kobject){};
context->ops_id = ops_id;
context->addr_unit = 1;
+ context->pause = false;
return context;
}
@@ -1053,6 +1055,30 @@ static ssize_t addr_unit_store(struct kobject *kobj,
return count;
}
+static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%c\n", context->pause ? 'Y' : 'N');
+}
+
+static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ bool pause;
+ int err = kstrtobool(buf, &pause);
+
+ if (err)
+ return err;
+ context->pause = pause;
+ return count;
+}
+
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1067,10 +1093,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr =
static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
__ATTR_RW_MODE(addr_unit, 0600);
+static struct kobj_attribute damon_sysfs_context_pause_attr =
+ __ATTR_RW_MODE(pause, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
&damon_sysfs_context_addr_unit_attr.attr,
+ &damon_sysfs_context_pause_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1470,6 +1500,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
if (sys_ctx->ops_id == DAMON_OPS_PADDR)
ctx->min_region_sz = max(
DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1);
+ ctx->pause = sys_ctx->pause;
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 9e5904c2beeb2..1b23a22ac04c4 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -694,6 +694,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 2,
.sz = 3,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST,
+ .fail_charge_num = 2,
+ .fail_charge_denom = 3,
.weight_sz = 4,
.weight_nr_accesses = 5,
.weight_age = 6,
@@ -703,6 +705,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 8,
.sz = 9,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL,
+ .fail_charge_num = 1,
+ .fail_charge_denom = 1024,
.weight_sz = 10,
.weight_nr_accesses = 11,
.weight_age = 12,
@@ -717,6 +721,8 @@ static void damos_test_commit_quota(struct kunit *test)
KUNIT_EXPECT_EQ(test, dst.ms, src.ms);
KUNIT_EXPECT_EQ(test, dst.sz, src.sz);
KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom);
KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz);
KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses);
KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age);
@@ -1077,6 +1083,10 @@ static void damon_test_commit_ctx(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
src->min_region_sz = 4095;
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL);
+ src->min_region_sz = 4096;
+ src->pause = true;
+ KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
+ KUNIT_EXPECT_TRUE(test, dst->pause);
damon_destroy_ctx(src);
damon_destroy_ctx(dst);
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b069dbc7e3d25..dd5f2d7027ac4 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -903,6 +903,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_COLLAPSE:
+ madv_action = MADV_COLLAPSE;
+ break;
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c1..ab34cab2416a4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3314,6 +3314,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
bool force_thp_readahead = false;
unsigned short mmap_miss;
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
+
/* Use the readahead code, even if readahead is disabled */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
(vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
@@ -3396,6 +3398,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->start = max(ra->start, vmf->vma->vm_pgoff);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra->order = 0;
@@ -3438,6 +3441,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
}
if (folio_test_readahead(folio)) {
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_ra(&ractl, folio, ra->ra_pages);
}
@@ -3747,8 +3751,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned short *mmap_miss,
- pgoff_t file_end)
+ unsigned long *rss, pgoff_t file_end)
{
struct address_space *mapping = folio->mapping;
unsigned int ref_from_caller = 1;
@@ -3781,16 +3784,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
goto skip;
/*
- * If there are too many folios that are recently evicted
- * in a file, they will probably continue to be evicted.
- * In such situation, read-ahead is only a waste of IO.
- * Don't decrease mmap_miss in this scenario to make sure
- * we can stop read-ahead.
- */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
- /*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
@@ -3836,7 +3829,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned short *mmap_miss)
+ unsigned long *rss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3844,10 +3837,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
if (PageHWPoison(page))
goto out;
- /* See comment of filemap_map_folio_range() */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
/*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit
@@ -3882,7 +3871,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
vm_fault_t ret = 0;
unsigned long rss = 0;
unsigned int nr_pages = 0, folio_type;
- unsigned short mmap_miss = 0, mmap_miss_saved;
/*
* Recalculate end_pgoff based on file_end before calling
@@ -3921,6 +3909,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
folio_type = mm_counter_file(folio);
do {
unsigned long end;
+ vm_fault_t map_ret;
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
@@ -3928,13 +3917,35 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
end = folio_next_index(folio) - 1;
nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
- if (!folio_test_large(folio))
- ret |= filemap_map_order0_folio(vmf,
- folio, addr, &rss, &mmap_miss);
- else
- ret |= filemap_map_folio_range(vmf, folio,
- xas.xa_index - folio->index, addr,
- nr_pages, &rss, &mmap_miss, file_end);
+ if (!folio_test_large(folio)) {
+ map_ret = filemap_map_order0_folio(vmf, folio, addr,
+ &rss);
+ } else {
+ unsigned long start = xas.xa_index - folio->index;
+
+ map_ret = filemap_map_folio_range(vmf, folio, start,
+ addr, nr_pages, &rss,
+ file_end);
+ }
+ ret |= map_ret;
+
+ /*
+ * If there are too many folios that are recently evicted
+ * in a file, they will probably continue to be evicted.
+ * In such situation, read-ahead is only a waste of IO.
+ * Don't decrease mmap_miss in this scenario to make sure
+ * we can stop read-ahead.
+ */
+ if ((map_ret & VM_FAULT_NOPAGE) &&
+ !(vmf->flags & FAULT_FLAG_TRIED) &&
+ !folio_test_workingset(folio)) {
+ unsigned short mmap_miss;
+
+ mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(file->f_ra.mmap_miss,
+ mmap_miss - 1);
+ }
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -3944,12 +3955,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
out:
rcu_read_unlock();
- mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
- if (mmap_miss >= mmap_miss_saved)
- WRITE_ONCE(file->f_ra.mmap_miss, 0);
- else
- WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
-
return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
diff --git a/mm/gup.c b/mm/gup.c
index ad9ded39609cb..0692119b79043 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!folio)
goto pte_unmap;
- if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
- unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) ||
+ unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 653f2dc034036..b7df167f7acbf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -429,61 +429,75 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
return count;
}
+enum defrag_mode {
+ DEFRAG_ALWAYS = 0,
+ DEFRAG_DEFER,
+ DEFRAG_DEFER_MADVISE,
+ DEFRAG_MADVISE,
+ DEFRAG_NEVER,
+};
+
+static const char * const defrag_mode_strings[] = {
+ [DEFRAG_ALWAYS] = "always",
+ [DEFRAG_DEFER] = "defer",
+ [DEFRAG_DEFER_MADVISE] = "defer+madvise",
+ [DEFRAG_MADVISE] = "madvise",
+ [DEFRAG_NEVER] = "never",
+};
+
+static const enum transparent_hugepage_flag defrag_flags[] = {
+ [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+ [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+};
+
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- const char *output;
+ int active = DEFRAG_NEVER;
+ int len = 0;
+ int i;
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- &transparent_hugepage_flags))
- output = "[always] defer defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- &transparent_hugepage_flags))
- output = "always [defer] defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer [defer+madvise] madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer defer+madvise [madvise] never";
- else
- output = "always defer defer+madvise madvise [never]";
+ for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) {
+ if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) {
+ active = i;
+ break;
+ }
+ }
- return sysfs_emit(buf, "%s\n", output);
+ for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) {
+ if (i == active)
+ len += sysfs_emit_at(buf, len, "[%s] ",
+ defrag_mode_strings[i]);
+ else
+ len += sysfs_emit_at(buf, len, "%s ",
+ defrag_mode_strings[i]);
+ }
+
+ /* Replace trailing space with newline */
+ buf[len - 1] = '\n';
+
+ return len;
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (sysfs_streq(buf, "always")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer+madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "never")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else
+ int mode, m;
+
+ mode = sysfs_match_string(defrag_mode_strings, buf);
+ if (mode < 0)
return -EINVAL;
+ for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) {
+ if (m == mode)
+ set_bit(defrag_flags[m], &transparent_hugepage_flags);
+ else
+ clear_bit(defrag_flags[m], &transparent_hugepage_flags);
+ }
+
return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
@@ -4192,11 +4206,10 @@ fail:
folio_unlock(new_folio);
/*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
+ * Subpages whose mapping has been zapped may be freed
+ * earlier, but freeing them requires taking the
+ * lru_lock, so we defer put_page() on tail pages until
+ * after the split completes.
*/
free_folio_and_swap_cache(new_folio);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c921287489de3..571212b80835e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2862,6 +2862,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct hugetlb_cgroup *h_cg_rsvd = NULL;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
idx = hstate_index(h);
@@ -2912,7 +2913,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
- idx, pages_per_huge_page(h), &h_cg);
+ idx, pages_per_huge_page(h), &h_cg_rsvd);
if (ret)
goto out_subpool_put;
}
@@ -2954,7 +2955,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, folio);
+ h_cg_rsvd, folio);
}
spin_unlock_irq(&hugetlb_lock);
@@ -3006,7 +3007,7 @@ out_uncharge_cgroup:
out_uncharge_cgroup_reservation:
if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
- h_cg);
+ h_cg_rsvd);
out_subpool_put:
/*
* put page to subpool iff the quota of subpool's rsv_hpages is used
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 5725a367246d9..10424cd25e5a6 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
break;
}
- kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp,
policy_name, !!test_cache);
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043f..28a843f30b32b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2528,8 +2528,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
cc->progress++;
continue;
}
- hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
- hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
+ hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend) {
cc->progress++;
continue;
@@ -2808,6 +2808,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
case SCAN_PAGE_FILLED:
+ case SCAN_PAGE_HAS_PRIVATE:
case SCAN_PAGE_DIRTY_OR_WRITEBACK:
return -EAGAIN;
/*
@@ -2845,8 +2846,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmgrab(mm);
lru_add_drain_all();
- hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = end & HPAGE_PMD_MASK;
+ hstart = ALIGN(start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE);
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
enum scan_result result = SCAN_FAIL;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2eff0d6b622b6..7c7ba17ce7af0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,6 +92,7 @@
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
+#include <linux/xarray.h>
#include <linux/crc32.h>
#include <asm/sections.h>
@@ -157,6 +158,8 @@ struct kmemleak_object {
struct hlist_head area_list;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
+ /* per-scan dedup count, valid only while in scan-local dedup xarray */
+ unsigned int dup_count;
char comm[TASK_COMM_LEN]; /* executable name */
};
@@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object)
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
*/
-static void print_unreferenced(struct seq_file *seq,
- struct kmemleak_object *object)
+static void __print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object,
+ bool hex_dump)
{
int i;
unsigned long *entries;
@@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq,
object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
- hex_dump_object(seq, object);
+ if (hex_dump)
+ hex_dump_object(seq, object);
warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum);
for (i = 0; i < nr_entries; i++) {
@@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq,
}
}
+static void print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ __print_unreferenced(seq, object, true);
+}
+
/*
* Print the kmemleak_object information. This function is used mainly for
* debugging special cases when kmemleak operations. It must be called with
@@ -1685,6 +1696,103 @@ unlock_put:
}
/*
+ * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it
+ * does not touch user memory that was freed concurrently; the rest of the
+ * report (backtrace, comm, pid) is always emitted since the kmemleak_object
+ * metadata is pinned by the caller.
+ */
+static void print_leak_locked(struct kmemleak_object *object, bool hex_dump)
+{
+ raw_spin_lock_irq(&object->lock);
+ __print_unreferenced(NULL, object,
+ hex_dump && (object->flags & OBJECT_ALLOCATED));
+ raw_spin_unlock_irq(&object->lock);
+}
+
+/*
+ * Per-scan dedup table for verbose leak printing. The xarray is keyed by
+ * stackdepot trace_handle and stores a pointer to the representative
+ * kmemleak_object. The per-scan repeat count lives in object->dup_count.
+ *
+ * dedup_record() must run outside object->lock: xa_store() may take
+ * mutexes (xa_node slab allocation) which lockdep would flag against the
+ * raw spinlock object->lock.
+ */
+static void dedup_record(struct xarray *dedup, struct kmemleak_object *object,
+ depot_stack_handle_t trace_handle)
+{
+ struct kmemleak_object *rep;
+ void *old;
+
+ /*
+ * No stack trace to dedup against: early-boot allocation tracked
+ * before kmemleak_init() set up object_cache, or stack_depot_save()
+ * failure under memory pressure.
+ */
+ if (!trace_handle) {
+ print_leak_locked(object, true);
+ return;
+ }
+
+ /* stack is available, now we can de-dup */
+ rep = xa_load(dedup, trace_handle);
+ if (rep) {
+ rep->dup_count++;
+ return;
+ }
+
+ /*
+ * Object is being torn down (use_count already hit zero); the
+ * tracked memory at object->pointer is unsafe to read, so skip.
+ */
+ if (!get_object(object))
+ return;
+
+ object->dup_count = 1;
+ old = xa_store(dedup, trace_handle, object, GFP_ATOMIC);
+ if (xa_is_err(old)) {
+ /* xa_node allocation failed; fall back to inline print. */
+ print_leak_locked(object, true);
+ put_object(object);
+ return;
+ }
+ /*
+ * scan_mutex serialises all writers to the dedup xarray, so xa_store()
+ * after a NULL xa_load() must always overwrite an empty slot.
+ */
+ WARN_ON_ONCE(old);
+}
+
+/*
+ * Drain the dedup table. Re-acquires object->lock and re-checks
+ * OBJECT_ALLOCATED before printing: while get_object() pins the
+ * kmemleak_object metadata, the underlying tracked allocation may have
+ * been freed since the scan walked it (kmemleak_free clears
+ * OBJECT_ALLOCATED under object->lock before the user memory goes away).
+ * The hex dump is skipped for coalesced entries since the bytes would
+ * differ across objects anyway.
+ */
+static void dedup_flush(struct xarray *dedup)
+{
+ struct kmemleak_object *object;
+ unsigned long idx;
+ unsigned int dup;
+ bool coalesced;
+
+ xa_for_each(dedup, idx, object) {
+ dup = object->dup_count;
+ coalesced = dup > 1;
+
+ print_leak_locked(object, !coalesced);
+ if (coalesced)
+ pr_warn(" ... and %u more object(s) with the same backtrace\n",
+ dup - 1);
+ put_object(object);
+ xa_erase(dedup, idx);
+ }
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1694,6 +1802,7 @@ static void kmemleak_scan(void)
struct kmemleak_object *object;
struct zone *zone;
int __maybe_unused i;
+ struct xarray dedup;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1834,10 +1943,18 @@ static void kmemleak_scan(void)
return;
/*
- * Scanning result reporting.
+ * Scanning result reporting. When verbose printing is enabled, dedupe
+ * by stackdepot trace_handle so each unique backtrace is logged once
+ * per scan, annotated with the number of objects that share it. The
+ * per-leak count below still reflects every object, and
+ * /sys/kernel/debug/kmemleak still lists them individually.
*/
+ xa_init(&dedup);
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
+ depot_stack_handle_t trace_handle;
+ bool dedup_print;
+
if (need_resched())
kmemleak_cond_resched(object);
@@ -1849,18 +1966,33 @@ static void kmemleak_scan(void)
if (!color_white(object))
continue;
raw_spin_lock_irq(&object->lock);
+ trace_handle = 0;
+ dedup_print = false;
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
-
- if (kmemleak_verbose)
- print_unreferenced(NULL, object);
-
+ if (kmemleak_verbose) {
+ trace_handle = object->trace_handle;
+ dedup_print = true;
+ }
new_leaks++;
}
raw_spin_unlock_irq(&object->lock);
+
+ /*
+ * Defer the verbose print outside object->lock: xa_store()
+ * may take xa_node slab locks at a higher wait-context level
+ * which lockdep would flag against the raw_spinlock_t
+ * object->lock. rcu_read_lock() keeps the kmemleak_object
+ * alive across the call.
+ */
+ if (dedup_print)
+ dedup_record(&dedup, object, trace_handle);
}
rcu_read_unlock();
+ /* Flush'em all */
+ dedup_flush(&dedup);
+ xa_destroy(&dedup);
if (new_leaks) {
kmemleak_found_leaks = true;
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf56..cd9bb077072cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
tlb_finish_mmu(madv_behavior->tlb);
}
-static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+/**
+ * check_input_range() - Check if the requested range is valid.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ *
+ * Returns: 0 if the input range is valid, otherwise an error code.
+ */
+static int check_input_range(unsigned long start, size_t len_in)
{
size_t len;
- if (!madvise_behavior_valid(behavior))
- return false;
-
if (!PAGE_ALIGNED(start))
- return false;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return false;
+ return -EINVAL;
if (start + len < start)
- return false;
-
- return true;
-}
+ return -EINVAL;
-/*
- * madvise_should_skip() - Return if the request is invalid or nothing.
- * @start: Start address of madvise-requested address range.
- * @len_in: Length of madvise-requested address range.
- * @behavior: Requested madvise behavior.
- * @err: Pointer to store an error code from the check.
- *
- * If the specified behaviour is invalid or nothing would occur, we skip the
- * operation. This function returns true in the cases, otherwise false. In
- * the former case we store an error on @err.
- */
-static bool madvise_should_skip(unsigned long start, size_t len_in,
- int behavior, int *err)
-{
- if (!is_valid_madvise(start, len_in, behavior)) {
- *err = -EINVAL;
- return true;
- }
- if (start + PAGE_ALIGN(len_in) == start) {
- *err = 0;
- return true;
- }
- return false;
+ return 0;
}
static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
@@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
.tlb = &tlb,
};
- if (madvise_should_skip(start, len_in, behavior, &error))
+ if (!madvise_behavior_valid(behavior))
+ return -EINVAL;
+
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
return error;
+
error = madvise_lock(&madv_behavior);
if (error)
return error;
@@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
size_t len_in = iter_iov_len(iter);
int error;
- if (madvise_should_skip(start, len_in, behavior, &error))
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
ret = error;
else
ret = madvise_do_behavior(start, len_in, &madv_behavior);
@@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
+ if (!madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
/*
* We need only perform this check if we are attempting to manipulate a
* remote process's address space.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1a4fd2504bcdf..431cad99189f9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4005,11 +4005,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg1_alloc_events(memcg))
goto fail;
+ pstatc_pcpu = parent ? parent->vmstats_percpu : NULL;
for_each_possible_cpu(cpu) {
- if (parent)
- pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
+ statc->parent_pcpu = pstatc_pcpu;
statc->vmstats = memcg->vmstats;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d47aef256a324..eff405a21c68b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -459,7 +459,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
* Only do anything when FORCEKILL is set, otherwise just free the
* list (this is used for clean pages which do not need killing)
*/
-static void kill_procs(struct list_head *to_kill, int forcekill,
+static void kill_procs(struct list_head *to_kill, bool forcekill,
unsigned long pfn, int flags)
{
struct to_kill *tk, *next;
@@ -1418,7 +1418,7 @@ try_again:
* We raced with (possibly temporary) unhandlable
* page, retry.
*/
- if (pass++ < 3) {
+ if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
shake_page(p);
goto try_again;
}
@@ -1582,7 +1582,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
{
LIST_HEAD(tokill);
bool unmap_success;
- int forcekill;
+ bool forcekill;
bool mlocked = folio_test_mlocked(folio);
/*
@@ -1703,7 +1703,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
unmap_mapping_range(mapping, start, size, 0);
}
- kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
+ kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 86a973119bd46..0c9d9c2cbf0e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
* Handle the case of a page which we actually need to copy to a new page,
* either due to COW or unsharing.
*
- * Called with mmap_lock locked and the old page referenced, but
- * without the ptl held.
+ * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK)
+ * and the old page referenced, but without the ptl held.
*
* High level logic flow:
*
@@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
* though the page will change only once the write actually happens. This
* avoids a few races, and potentially makes it more efficient.
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. We return with
+ * the same lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
@@ -4785,12 +4785,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
- * We return with the mmap_lock locked or unlocked in the same cases
- * as does filemap_fault().
+ * When returning, the lock may have been released in the same cases
+ * as done by filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
@@ -5330,9 +5330,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked.
+ * We return with the lock still held, but pte unmapped and unlocked.
+ * If VM_FAULT_RETRY is returned, the lock may have been released.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
@@ -5440,9 +5441,10 @@ oom:
}
/*
- * The mmap_lock must have been held on entry, and may have been
- * released depending on flags and vma->vm_ops->fault() return value.
- * See filemap_fault() and __lock_page_retry().
+ * Either the VMA lock or the mmap_lock must have been held on entry
+ * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on
+ * flags and vma->vm_ops->fault() return value.
+ * See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
@@ -5480,7 +5482,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
if (unlikely(PageHWPoison(vmf->page))) {
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
- if (page_mapped(vmf->page))
+ if (folio_mapped(folio))
unmap_mapping_folio(folio);
/* Retry if a clean folio was removed from the cache. */
if (mapping_evict_folio(folio->mapping, folio))
@@ -6003,11 +6005,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults).
- * The mmap_lock may have been released depending on flags and our
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK).
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
- * If mmap_lock is released, vma may become invalid (for example
+ * If the lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
@@ -6374,10 +6376,11 @@ static void fix_spurious_fault(struct vm_fault *vmf,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
- * concurrent faults).
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our return value.
+ * The mmap_lock or VMA lock may have been released depending on flags
+ * and our return value.
* See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
@@ -6458,8 +6461,8 @@ unlock:
/*
* On entry, we hold either the VMA lock or the mmap_lock
- * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
- * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in
+ * the result, the lock is not held on exit. See filemap_fault()
* and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
@@ -6691,9 +6694,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
/*
* By the time we get here, we already hold either the VMA lock or the
- * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
+ * mmap_lock (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 40c7915dabe05..462d8dcd636dc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
+ * @pgmap: device page map or %NULL if not ZONE_DEVICE
*
* Generic helper function to remove section mappings and sysfs entries
* for the section of the memory we are removing. Caller needs to make
@@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* calling offline_pages().
*/
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
@@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- sparse_remove_section(pfn, cur_nr_pages, altmap);
+ sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap);
}
}
@@ -1427,7 +1428,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
remove_memory_block_devices(cur_start, memblock_size);
- arch_remove_memory(cur_start, memblock_size, altmap);
+ arch_remove_memory(cur_start, memblock_size, altmap, NULL);
/* Verify that all vmemmap pages have actually been freed. */
WARN(altmap->alloc, "Altmap not fully unmapped");
@@ -1470,7 +1471,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
ret = create_memory_block_devices(cur_start, memblock_size, nid,
params.altmap, group);
if (ret) {
- arch_remove_memory(cur_start, memblock_size, NULL);
+ arch_remove_memory(cur_start, memblock_size, params.altmap, NULL);
kfree(params.altmap);
goto out;
}
@@ -1556,7 +1557,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
/* create memory block devices after memory was added */
ret = create_memory_block_devices(start, size, nid, NULL, group);
if (ret) {
- arch_remove_memory(start, size, params.altmap);
+ arch_remove_memory(start, size, params.altmap, NULL);
goto error;
}
}
@@ -2268,7 +2269,7 @@ static int try_remove_memory(u64 start, u64 size)
* No altmaps present, do the removal directly
*/
remove_memory_block_devices(start, size);
- arch_remove_memory(start, size, NULL);
+ arch_remove_memory(start, size, NULL, NULL);
} else {
/* all memblocks in the range have altmaps */
remove_memory_blocks_and_altmaps(start, size);
diff --git a/mm/memremap.c b/mm/memremap.c
index 053842d45cb10..81766d8224009 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), NULL);
+ PHYS_PFN(range_len(range)), NULL, pgmap);
} else {
arch_remove_memory(range->start, range_len(range),
- pgmap_altmap(pgmap));
+ pgmap_altmap(pgmap), pgmap);
kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index 8a64291ab5b44..0c6a0ab6eccef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
* This is safe because nobody is using it except us.
*/
enum {
- PAGE_WAS_MAPPED = BIT(0),
- PAGE_WAS_MLOCKED = BIT(1),
- PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+ FOLIO_WAS_MAPPED = BIT(0),
+ FOLIO_WAS_MLOCKED = BIT(1),
+ FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED,
};
static void __migrate_folio_record(struct folio *dst,
- int old_page_state,
- struct anon_vma *anon_vma)
+ int old_folio_state, struct anon_vma *anon_vma)
{
- dst->private = (void *)anon_vma + old_page_state;
+ dst->private = (void *)anon_vma + old_folio_state;
}
static void __migrate_folio_extract(struct folio *dst,
- int *old_page_state,
- struct anon_vma **anon_vmap)
+ int *old_folio_state, struct anon_vma **anon_vmap)
{
unsigned long private = (unsigned long)dst->private;
- *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
- *old_page_state = private & PAGE_OLD_STATES;
+ *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
+ *old_folio_state = private & FOLIO_OLD_STATES;
dst->private = NULL;
}
@@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
{
struct folio *dst;
int rc = -EAGAIN;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool locked = false;
bool dst_locked = false;
@@ -1253,7 +1251,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
}
locked = true;
if (folio_test_mlocked(src))
- old_page_state |= PAGE_WAS_MLOCKED;
+ old_folio_state |= FOLIO_WAS_MLOCKED;
if (folio_test_writeback(src)) {
/*
@@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
dst_locked = true;
if (unlikely(page_has_movable_ops(&src->page))) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
- old_page_state |= PAGE_WAS_MAPPED;
+ old_folio_state |= FOLIO_WAS_MAPPED;
}
if (!folio_mapped(src)) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1344,7 +1342,7 @@ out:
if (rc == -EAGAIN)
ret = NULL;
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, locked, ret);
migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
@@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct list_head *ret)
{
int rc;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool src_deferred_split = false;
bool src_partially_mapped = false;
struct list_head *prev;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
@@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
* isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst);
- if (old_page_state & PAGE_WAS_MLOCKED)
+ if (old_folio_state & FOLIO_WAS_MLOCKED)
lru_add_drain();
- if (old_page_state & PAGE_WAS_MAPPED)
+ if (old_folio_state & FOLIO_WAS_MAPPED)
remove_migration_ptes(src, dst, 0);
out_unlock_both:
@@ -1439,11 +1437,11 @@ out:
*/
if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return rc;
}
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
@@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios,
dst = list_first_entry(dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, src_folios, lru) {
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret_folios);
list_del(&dst->lru);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 19cd14b341146..554754eb26ff2 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
bool flush = false;
unsigned long i;
- VM_WARN_ON_FOLIO(!folio, folio);
- VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+ VM_WARN_ON_ONCE(!folio);
if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
return -EINVAL;
@@ -859,11 +858,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
if (userfaultfd_missing(vma))
goto unlock_abort;
- if (!pmd_none(*pmdp)) {
- if (!is_huge_zero_pmd(*pmdp))
- goto unlock_abort;
+ if (is_huge_zero_pmd(*pmdp))
flush = true;
- } else if (!pmd_none(*pmdp))
+ else if (!pmd_none(*pmdp))
goto unlock_abort;
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921cd..bd466a3c10c8e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -674,6 +674,20 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_ZONE_DEVICE
+static __meminit void pageblock_migratetype_init_range(unsigned long pfn,
+ unsigned long nr_pages, int migratetype)
+{
+ const unsigned long end = pfn + nr_pages;
+
+ for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) {
+ init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+ }
+}
+#endif
+
/*
* Initialize a reserved page unconditionally, finding its zone first.
*/
@@ -1012,21 +1026,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
page->zone_device_data = NULL;
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (pageblock_aligned(pfn)) {
- init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
- cond_resched();
- }
-
- /*
* ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
* directly to the driver page allocator which will set the page count
* to 1 when allocating the page.
@@ -1056,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* of how the sparse_vmemmap internals handle compound pages in the lack
* of an altmap. See vmemmap_populate_compound_pages().
*/
-static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+static inline unsigned long compound_nr_pages(unsigned long pfn,
+ struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- if (!vmemmap_can_optimize(altmap, pgmap))
+ /*
+ * If DAX memory is hot-plugged into an unoccupied subsection
+ * of an early section, the unoptimized boot memmap is reused.
+ * See section_activate().
+ */
+ if (early_section(__pfn_to_section(pfn)) ||
+ !vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
@@ -1122,13 +1128,18 @@ void __ref memmap_init_zone_device(struct zone *zone,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+
if (pfns_per_compound == 1)
continue;
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- compound_nr_pages(altmap, pgmap));
+ compound_nr_pages(pfn, altmap, pgmap));
}
+ pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE);
+
pr_debug("%s initialised %lu pages in %ums\n", __func__,
nr_pages, jiffies_to_msecs(jiffies - start));
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 5754d1c364624..2311ae7c2ff45 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
break;
case MAP_DROPPABLE:
if (VM_DROPPABLE == VM_NONE)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
/*
* A locked or stack area makes no sense to be droppable.
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d49c254174da7..69a99af777771 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ typedef int __bitwise fpi_t;
/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+/* free_pages_prepare() has already been called for page(s) being freed. */
+#define FPI_PREPARED ((__force fpi_t)BIT(3))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -282,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+/*
+ * When page allocations stall for longer than a threshold,
+ * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning
+ * will be printed during this duration for the entire system.
+ */
+#define ALLOC_STALL_WARN_MSECS (10 * 1000UL)
+static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES;
+
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order,
int alloc_flags);
@@ -1211,14 +1222,18 @@ static inline bool should_skip_kasan_poison(struct page *page)
return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
-static void kernel_init_pages(struct page *page, int numpages)
+static void clear_highpages_kasan_tagged(struct page *page, int numpages)
{
- int i;
-
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++)
- clear_highpage_kasan_tagged(page + i);
+ if (!IS_ENABLED(CONFIG_HIGHMEM)) {
+ clear_pages(kasan_reset_tag(page_address(page)), numpages);
+ } else {
+ int i;
+
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
+ }
kasan_enable_current();
}
@@ -1303,8 +1318,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
#endif /* CONFIG_MEM_ALLOC_PROFILING */
-__always_inline bool __free_pages_prepare(struct page *page,
- unsigned int order, fpi_t fpi_flags)
+static __always_inline bool __free_pages_prepare(struct page *page,
+ unsigned int order, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page);
@@ -1312,6 +1327,9 @@ __always_inline bool __free_pages_prepare(struct page *page,
bool compound = PageCompound(page);
struct folio *folio = page_folio(page);
+ if (fpi_flags & FPI_PREPARED)
+ return true;
+
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
@@ -1423,7 +1441,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1451,7 +1469,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
- unsigned long flags;
unsigned int order;
struct page *page;
@@ -1464,7 +1481,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (count > 0) {
struct list_head *list;
@@ -1496,8 +1513,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
}
/* Split a multi-block free page into its individual pageblocks. */
@@ -1848,7 +1863,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
/* If memory is still not initialized, initialize it now. */
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -3424,7 +3439,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
struct zone *zone)
{
int mt;
- unsigned long max_managed, flags;
+ unsigned long max_managed;
/*
* The number reserved as: minimum is 1 pageblock, maximum is
@@ -3438,29 +3453,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
if (zone->nr_reserved_highatomic >= max_managed)
return;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
+ return;
/* Yoink! */
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
if (!migratetype_is_mergeable(mt))
- goto out_unlock;
+ return;
if (order < pageblock_order) {
if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
+ return;
zone->nr_reserved_highatomic += pageblock_nr_pages;
} else {
change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
zone->nr_reserved_highatomic += 1 << order;
}
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
}
/*
@@ -3476,7 +3488,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
@@ -3493,7 +3504,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
pageblock_nr_pages)
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
unsigned long size;
@@ -3540,12 +3551,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* so this should not fail on zone boundaries.
*/
WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (ret > 0)
return ret;
- }
}
- spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
@@ -4678,6 +4686,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
return false;
}
+static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned int order, unsigned long alloc_start_time)
+{
+ static DEFINE_SPINLOCK(alloc_stall_lock);
+ unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time);
+
+ if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS))
+ return;
+ if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies)))
+ return;
+ if (gfp_mask & __GFP_NOWARN)
+ return;
+
+ if (!spin_trylock(&alloc_stall_lock))
+ return;
+
+ /* Check again, this time under the lock */
+ if (time_is_after_jiffies(alloc_stall_warn_jiffies)) {
+ spin_unlock(&alloc_stall_lock);
+ return;
+ }
+
+ WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS));
+ spin_unlock(&alloc_stall_lock);
+
+ pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl",
+ current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
+ cpuset_print_current_mems_allowed();
+ pr_cont("\n");
+ dump_stack();
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -4698,6 +4740,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
bool compact_first = false;
bool can_retry_reserves = true;
+ unsigned long alloc_start_time = jiffies;
if (unlikely(nofail)) {
/*
@@ -4813,6 +4856,9 @@ retry:
if (current->flags & PF_MEMALLOC)
goto nopage;
+ /* If allocation has taken excessively long, warn about it */
+ check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time);
+
/* Try direct reclaim and then allocating */
if (!compact_first) {
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
@@ -5044,7 +5090,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct per_cpu_pages *pcp;
struct list_head *pcp_list;
struct alloc_context ac;
- gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
int nr_populated = 0, nr_account = 0;
@@ -5085,10 +5130,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
gfp &= gfp_allowed_mask;
- alloc_gfp = gfp;
- if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags))
goto out;
- gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
z = ac.preferred_zoneref;
@@ -5180,6 +5223,34 @@ failed:
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
+ * free_pages_bulk - Free an array of order-0 pages
+ * @page_array: Array of pages to free
+ * @nr_pages: The number of pages in the array
+ *
+ * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous
+ * run are released with a single __free_contig_range() call.
+ *
+ * This assumes page_array is sorted in ascending PFN order. Without that,
+ * the function still frees all pages, but contiguous runs may not be
+ * detected and the freeing pattern can degrade to freeing one page at a
+ * time.
+ *
+ * Context: Sleepable process context only; calls cond_resched()
+ */
+void free_pages_bulk(struct page **page_array, unsigned long nr_pages)
+{
+ while (nr_pages) {
+ unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages);
+
+ __free_contig_range(page_to_pfn(*page_array), nr_contig);
+
+ nr_pages -= nr_contig;
+ page_array += nr_contig;
+ cond_resched();
+ }
+}
+
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
@@ -6758,6 +6829,105 @@ void __init page_alloc_sysctl_init(void)
register_sysctl_init("vm", page_alloc_sysctl_table);
}
+static void free_prepared_contig_range(struct page *page,
+ unsigned long nr_pages)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ while (nr_pages) {
+ unsigned int order;
+
+ /* We are limited by the largest buddy order. */
+ order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER;
+ /* Don't exceed the number of pages to free. */
+ order = min_t(unsigned int, order, ilog2(nr_pages));
+ order = min_t(unsigned int, order, MAX_PAGE_ORDER);
+
+ /*
+ * Free the chunk as a single block. Our caller has already
+ * called free_pages_prepare() for each order-0 page.
+ */
+ __free_frozen_pages(page, order, FPI_PREPARED);
+
+ pfn += 1UL << order;
+ page += 1UL << order;
+ nr_pages -= 1UL << order;
+ }
+}
+
+static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages,
+ bool is_frozen)
+{
+ struct page *page, *start = NULL;
+ unsigned long nr_start = 0;
+ unsigned long start_sec;
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++) {
+ bool can_free = true;
+
+ /*
+ * Contiguous PFNs might not have contiguous "struct pages"
+ * in some kernel configs: page++ across a section boundary
+ * is undefined. Use pfn_to_page() for each PFN.
+ */
+ page = pfn_to_page(pfn + i);
+
+ VM_WARN_ON_ONCE(PageHead(page));
+ VM_WARN_ON_ONCE(PageTail(page));
+
+ if (!is_frozen)
+ can_free = put_page_testzero(page);
+
+ if (can_free)
+ can_free = free_pages_prepare(page, 0);
+
+ if (!can_free) {
+ if (start) {
+ free_prepared_contig_range(start, i - nr_start);
+ start = NULL;
+ }
+ continue;
+ }
+
+ if (start && memdesc_section(page->flags) != start_sec) {
+ free_prepared_contig_range(start, i - nr_start);
+ start = page;
+ nr_start = i;
+ start_sec = memdesc_section(page->flags);
+ } else if (!start) {
+ start = page;
+ nr_start = i;
+ start_sec = memdesc_section(page->flags);
+ }
+ }
+
+ if (start)
+ free_prepared_contig_range(start, nr_pages - nr_start);
+}
+
+/**
+ * __free_contig_range - Free contiguous range of order-0 pages.
+ * @pfn: Page frame number of the first page in the range.
+ * @nr_pages: Number of pages to free.
+ *
+ * For each order-0 struct page in the physically contiguous range, put a
+ * reference. Free any page who's reference count falls to zero. The
+ * implementation is functionally equivalent to, but significantly faster than
+ * calling __free_page() for each struct page in a loop.
+ *
+ * Memory allocated with alloc_pages(order>=1) then subsequently split to
+ * order-0 with split_page() is an example of appropriate contiguous pages that
+ * can be freed with this API.
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
+void __free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
@@ -6895,8 +7065,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
{
- for (; nr_pages--; pfn++)
- free_frozen_pages(pfn_to_page(pfn), 0);
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true);
}
/**
@@ -7304,8 +7473,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
return;
- for (; nr_pages--; pfn++)
- __free_page(pfn_to_page(pfn));
+ __free_contig_range(pfn, nr_pages);
}
EXPORT_SYMBOL(free_contig_range);
#endif /* CONFIG_CONTIG_ALLOC */
@@ -7363,7 +7531,7 @@ void zone_pcp_reset(struct zone *zone)
unsigned long __offline_isolated_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long already_offline = 0, flags;
+ unsigned long already_offline = 0;
unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
@@ -7371,7 +7539,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (pfn < end_pfn) {
page = pfn_to_page(pfn);
/*
@@ -7401,7 +7569,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
pfn += (1 << order);
}
- spin_unlock_irqrestore(&zone->lock, flags);
return end_pfn - start_pfn - already_offline;
}
@@ -7473,11 +7640,9 @@ bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
@@ -7492,14 +7657,12 @@ bool take_page_off_buddy(struct page *page)
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
- ret = true;
- break;
+ return true;
}
if (page_count(page_head) > 0)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
/*
@@ -7508,23 +7671,19 @@ bool take_page_off_buddy(struct page *page)
bool put_page_back_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
- unsigned long flags;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (put_page_testzero(page)) {
unsigned long pfn = page_to_pfn(page);
int migratetype = get_pfnblock_migratetype(page, pfn);
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
- if (TestClearPageHWPoison(page)) {
- ret = true;
- }
+ if (TestClearPageHWPoison(page))
+ return true;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index 70cea9e24d2fd..7ed76592e20d8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -326,8 +326,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
struct swap_iocb {
struct kiocb iocb;
- struct bio_vec bvec[SWAP_CLUSTER_MAX];
- int pages;
+ struct bio_vec bvecs[SWAP_CLUSTER_MAX];
+ int nr_bvecs;
int len;
};
static mempool_t *sio_pool;
@@ -348,7 +348,7 @@ int sio_pool_init(void)
static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec[0].bv_page;
+ struct page *page = sio->bvecs[0].bv_page;
int p;
if (ret != sio->len) {
@@ -362,15 +362,15 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
*/
pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
ret, swap_dev_pos(page_swap_entry(page)));
- for (p = 0; p < sio->pages; p++) {
- page = sio->bvec[p].bv_page;
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ page = sio->bvecs[p].bv_page;
set_page_dirty(page);
ClearPageReclaim(page);
}
}
- for (p = 0; p < sio->pages; p++)
- end_page_writeback(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++)
+ end_page_writeback(sio->bvecs[p].bv_page);
mempool_free(sio, sio_pool);
}
@@ -397,13 +397,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
init_sync_kiocb(&sio->iocb, swap_file);
sio->iocb.ki_complete = sio_write_complete;
sio->iocb.ki_pos = pos;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
@@ -477,7 +477,7 @@ void swap_write_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -489,8 +489,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
int p;
if (ret == sio->len) {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = page_folio(sio->bvecs[p].bv_page);
count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
@@ -499,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
}
count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
} else {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = page_folio(sio->bvecs[p].bv_page);
folio_unlock(folio);
}
@@ -559,13 +559,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
init_sync_kiocb(&sio->iocb, sis->swap_file);
sio->iocb.ki_pos = pos;
sio->iocb.ki_complete = sio_read_complete;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) {
swap_read_unplug(sio);
sio = NULL;
}
@@ -666,7 +666,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_read_complete(&sio->iocb, ret);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c48ff5c002449..7a9d631945a34 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
{
struct zone *zone = page_zone(page);
struct page *unmovable;
- unsigned long flags;
unsigned long check_unmovable_start, check_unmovable_end;
if (PageUnaccepted(page))
accept_page(page);
- spin_lock_irqsave(&zone->lock, flags);
-
- /*
- * We assume the caller intended to SET migrate type to isolate.
- * If it is already set, then someone else must have raced and
- * set it before us.
- */
- if (is_migrate_isolate_page(page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return -EBUSY;
- }
+ scoped_guard(spinlock_irqsave, &zone->lock) {
+ /*
+ * We assume the caller intended to SET migrate type to
+ * isolate. If it is already set, then someone else must have
+ * raced and set it before us.
+ */
+ if (is_migrate_isolate_page(page))
+ return -EBUSY;
- /*
- * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
- * We just check MOVABLE pages.
- *
- * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
- * to avoid redundant checks.
- */
- check_unmovable_start = max(page_to_pfn(page), start_pfn);
- check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
- end_pfn);
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by
+ * itself. We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's
+ * pageblock to avoid redundant checks.
+ */
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+ end_pfn);
- unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- mode);
- if (!unmovable) {
- if (!pageblock_isolate_and_move_free_pages(zone, page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return -EBUSY;
+ unmovable = has_unmovable_pages(check_unmovable_start,
+ check_unmovable_end, mode);
+ if (!unmovable) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page))
+ return -EBUSY;
+ zone->nr_isolate_pageblock++;
+ return 0;
}
- zone->nr_isolate_pageblock++;
- spin_unlock_irqrestore(&zone->lock, flags);
- return 0;
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
@@ -223,15 +215,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
bool isolated_page = false;
unsigned int order;
struct page *buddy;
zone = page_zone(page);
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (!is_migrate_isolate_page(page))
- goto out;
+ return;
/*
* Because freepage with more than pageblock_order on isolated
@@ -279,8 +270,6 @@ static void unset_migratetype_isolate(struct page *page)
__putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8178e0be557f8..2dddcb6510aa1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
- &page->flags);
+ &page->flags.f);
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
diff --git a/mm/readahead.c b/mm/readahead.c
index 7b05082c89ea2..8c12b63ccd4a2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -324,6 +324,8 @@ static void do_page_cache_ra(struct readahead_control *ractl,
return;
end_index = (isize - 1) >> PAGE_SHIFT;
+ if (end_index > ractl->_max_index)
+ end_index = ractl->_max_index;
if (index > end_index)
return;
/* Don't read past the page containing the last byte of the file */
@@ -471,7 +473,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
unsigned int min_order = mapping_min_folio_order(mapping);
- pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ pgoff_t limit;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
@@ -484,6 +486,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
goto fallback;
}
+ limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ limit = min(limit, ractl->_max_index);
limit = min(limit, index + ra->size - 1);
new_order = min(mapping_max_folio_order(mapping), new_order);
diff --git a/mm/rmap.c b/mm/rmap.c
index 99e1b3dc390b7..1c77d5dc06e9f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -571,7 +571,7 @@ void __init anon_vma_init(void)
* In case it was remapped to a different anon_vma, the new anon_vma will be a
* child of the old anon_vma, and the anon_vma lifetime rules will therefore
* ensure that any anon_vma obtained from the page will still be valid for as
- * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ * long as we observe folio_mapped() [ hence all those folio_mapped() tests ].
*
* All users of this function must be very careful when walking the anon_vma
* chain and verify that the page in question is indeed mapped in it
@@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_unmap() may return before page_mapped() has become false,
+ * try_to_unmap() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_migrate() may return before page_mapped() has become false,
+ * try_to_migrate() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
/*
* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
- * because that depends on page_mapped(); but not all its usages
+ * because that depends on folio_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 3b5dc21b323c2..bab3529af23c5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3101,10 +3101,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
cache_no_acl(inode);
if (sbinfo->noswap)
mapping_set_unevictable(inode->i_mapping);
-
- /* Don't consider 'deny' for emergencies and 'force' for testing */
- if (sbinfo->huge)
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 6eadb9d116e43..112ccf9c71caf 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
- void *ptr;
-
if (altmap)
return altmap_alloc_block_buf(size, altmap);
- ptr = sparse_buffer_alloc(size);
- if (!ptr)
- ptr = vmemmap_alloc_block(size, node);
- return ptr;
+ return vmemmap_alloc_block(size, node);
}
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
@@ -151,7 +146,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
unsigned long ptpfn, unsigned long flags)
{
@@ -195,7 +190,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
return p;
}
-pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
@@ -208,7 +203,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
+static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
@@ -221,7 +216,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
return pud;
}
-p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
@@ -234,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
return p4d;
}
-pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
if (pgd_none(*pgd)) {
@@ -652,26 +647,61 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
+static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ const unsigned int order = pgmap ? pgmap->vmemmap_shift : 0;
+ const unsigned long pages_per_compound = 1UL << order;
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
+ VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
+
+ if (!vmemmap_can_optimize(altmap, pgmap))
+ return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
+
+ if (order < PFN_SECTION_SHIFT) {
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound));
+ return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound;
+ }
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION));
+
+ if (IS_ALIGNED(pfn, pages_per_compound))
+ return VMEMMAP_RESERVE_NR;
+
+ return 0;
+}
+
static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
+ struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap,
+ pgmap);
+
+ memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
+
+ return page;
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
+ memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
vmemmap_free(start, end, altmap);
}
+
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long pfn = page_to_pfn(memmap);
+ memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
+ NULL, NULL));
vmemmap_free(start, end, NULL);
}
@@ -737,7 +767,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
* usage map, but still need to free the vmemmap range.
*/
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
@@ -774,14 +804,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early) {
- memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
- depopulate_section_memmap(pfn, nr_pages, altmap);
- } else if (memmap) {
- memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
- PAGE_SIZE)));
+ if (!section_is_early)
+ depopulate_section_memmap(pfn, nr_pages, altmap, pgmap);
+ else if (memmap)
free_map_bootmem(memmap);
- }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -823,10 +849,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
if (!memmap) {
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
return ERR_PTR(-ENOMEM);
}
- memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
@@ -885,13 +910,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
}
void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab13..16ac6df3c89fa 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -201,13 +201,11 @@ static void __init memblocks_present(void)
int i, nid;
#ifdef CONFIG_SPARSEMEM_EXTREME
- if (unlikely(!mem_section)) {
- unsigned long size, align;
+ unsigned long size, align;
- size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
- align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc_or_panic(size, align);
- }
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_alloc_or_panic(size, align);
#endif
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
@@ -241,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
- struct page *map = sparse_buffer_alloc(size);
+ struct page *map;
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- if (map)
- return map;
-
map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
@@ -256,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-static void *sparsemap_buf __meminitdata;
-static void *sparsemap_buf_end __meminitdata;
-
-static inline void __meminit sparse_buffer_free(unsigned long size)
-{
- WARN_ON(!sparsemap_buf || size == 0);
- memblock_free(sparsemap_buf, size);
-}
-
-static void __init sparse_buffer_init(unsigned long size, int nid)
-{
- phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
- /*
- * Pre-allocated buffer is mainly used by __populate_section_memmap
- * and we want it to be properly aligned to the section size - this is
- * especially the case for VMEMMAP which maps memmap to PMDs
- */
- sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
- sparsemap_buf_end = sparsemap_buf + size;
-}
-
-static void __init sparse_buffer_fini(void)
-{
- unsigned long size = sparsemap_buf_end - sparsemap_buf;
-
- if (sparsemap_buf && size > 0)
- sparse_buffer_free(size);
- sparsemap_buf = NULL;
-}
-
-void * __meminit sparse_buffer_alloc(unsigned long size)
-{
- void *ptr = NULL;
-
- if (sparsemap_buf) {
- ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
- if (ptr + size > sparsemap_buf_end)
- ptr = NULL;
- else {
- /* Free redundant aligned space */
- if ((unsigned long)(ptr - sparsemap_buf) > 0)
- sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
- sparsemap_buf = ptr + size;
- }
- }
- return ptr;
-}
-
void __weak __meminit vmemmap_populate_print_last(void)
{
}
@@ -362,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
goto failed;
}
- sparse_buffer_init(map_count * section_map_size(), nid);
-
sparse_vmemmap_init_nid_early(nid);
for_each_present_section_nr(pnum_begin, pnum) {
@@ -381,7 +325,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
__func__, nid);
pnum_begin = pnum;
sparse_usage_fini();
- sparse_buffer_fini();
goto failed;
}
memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
@@ -390,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
}
}
sparse_usage_fini();
- sparse_buffer_fini();
return;
failed:
/*
diff --git a/mm/swap.c b/mm/swap.c
index 5cc44f0de9877..2dd84813f4dde 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -160,14 +160,42 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
+ struct folio_batch free_fbatch;
+ bool is_lru_add = (move_fn == lru_add);
+
+ /*
+ * If we're adding to the LRU, preemptively filter dead folios. Use
+ * this dedicated folio batch for temp storage and deferred cleanup.
+ */
+ if (is_lru_add)
+ folio_batch_init(&free_fbatch);
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
/* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ if (!is_lru_add && !folio_test_clear_lru(folio))
continue;
+ /*
+ * Filter dead folios by moving them from the add batch to the temp
+ * batch for freeing after this loop.
+ *
+ * We're bypassing normal cleanup. Clear flags that are not
+ * applicable to dead folios.
+ *
+ * Since the folio may be part of a huge page, unqueue from
+ * deferred split list to avoid a dangling list entry.
+ */
+ if (is_lru_add && folio_ref_freeze(folio, 1)) {
+ __folio_clear_active(folio);
+ __folio_clear_unevictable(folio);
+ folio_unqueue_deferred_split(folio);
+ fbatch->folios[i] = NULL;
+ folio_batch_add(&free_fbatch, folio);
+ continue;
+ }
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec)
lruvec_unlock_irqrestore(lruvec, flags);
+
+ /* Cleanup filtered dead folios. */
+ if (is_lru_add) {
+ mem_cgroup_uncharge_folios(&free_fbatch);
+ free_unref_folios(&free_fbatch);
+ }
+
folios_put(fbatch);
}
@@ -964,6 +999,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
struct folio *folio = folios->folios[i];
unsigned int nr_refs = refs ? refs[i] : 1;
+ /* Folio batch entry may have been preemptively removed during drain. */
+ if (!folio)
+ continue;
+
if (is_huge_zero_folio(folio))
continue;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9174f1eeffb09..74a1e324449dc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1054,6 +1054,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
+ cond_resched();
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bb6ae08d18f58..eabb86b13b7e5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3459,19 +3459,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
- for (i = 0; i < vm->nr_pages; i++) {
- struct page *page = vm->pages[i];
- BUG_ON(!page);
- /*
- * High-order allocs for huge vmallocs are split, so
- * can be freed as an array of order-0 allocations
- */
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_lruvec_page_state(page, NR_VMALLOC, -1);
- __free_page(page);
- cond_resched();
+ if (!(vm->flags & VM_MAP_PUT_PAGES)) {
+ for (i = 0; i < vm->nr_pages; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
}
+ free_pages_bulk(vm->pages, vm->nr_pages);
+
kvfree(vm->pages);
kfree(vm);
}
@@ -3939,7 +3933,7 @@ fail:
__GFP_NOFAIL | __GFP_ZERO |\
__GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
- GFP_USER | __GFP_NOLOCKDEP)
+ GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN)
static gfp_t vmalloc_fix_flags(gfp_t flags)
{
@@ -3980,6 +3974,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
*
* %__GFP_NOWARN can be used to suppress failure messages.
*
+ * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages
+ * (when prot=%PAGE_KERNEL).
+ *
* Can not be called from interrupt nor NMI contexts.
* Return: the address of the area or %NULL on failure
*/
@@ -3993,6 +3990,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
+ bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN);
if (WARN_ON_ONCE(!size))
return NULL;
@@ -4023,7 +4021,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
again:
area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
- gfp_mask, caller);
+ gfp_mask & ~__GFP_SKIP_KASAN, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
@@ -4041,7 +4039,7 @@ again:
* kasan_unpoison_vmalloc().
*/
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
- if (kasan_hw_tags_enabled()) {
+ if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) {
/*
* Modify protection bits to allow tagging.
* This must be done before mapping.
@@ -4078,7 +4076,8 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
+ if (!skip_vmalloc_kasan)
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 3fbb86996c4d2..f053554e58264 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work)
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
+ * @order: allocation order being reclaimed for
* @memcg: cgroup memory controller handle
* @tree: legacy subtree mode
* @scanned: number of pages scanned
@@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work)
*
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr;
@@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
level = vmpressure_calc_level(scanned, reclaimed);
- if (level > VMPRESSURE_LOW) {
+ /*
+ * Once we go above COSTLY_ORDER, reclaim relies heavily on
+ * compaction to make progress. Reclaim efficiency was never a
+ * great proxy for pressure to begin with, but it's outright
+ * misleading with these high orders. Don't throttle sockets
+ * because somebody is attempting something crazy like an order-7
+ * and predictably struggling.
+ */
+ if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) {
/*
* Let the socket buffer allocator know that
* we are having trouble reclaiming LRU pages.
@@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, true, vmpressure_win, 0);
+ vmpressure(gfp, 0, memcg, true, vmpressure_win, 0);
}
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa125819..4b09843876583 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -109,7 +109,7 @@ struct scan_control {
/* zone_reclaim_mode */
unsigned int may_unmap:1;
- /* zome_reclaim_mode, boost reclaim, cgroup restrictions */
+ /* zone_reclaim_mode, boost reclaim, cgroup restrictions */
unsigned int may_swap:1;
/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -5071,8 +5071,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
+ sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
flush_reclaim_state(sc);
@@ -6175,7 +6175,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
/* Record the group's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false,
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
@@ -6220,7 +6220,7 @@ again:
/* Record the subtree's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned, nr_node_reclaimed);
if (nr_node_reclaimed)
@@ -6359,7 +6359,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
if (current_is_kswapd() || cgroup_reclaim(sc))
return;
- /* Throttle if making no progress at high prioities. */
+ /* Throttle if making no progress at high priorities. */
if (sc->priority == 1 && !sc->nr_reclaimed)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
@@ -7121,6 +7121,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.may_unmap = 1,
};
+ trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order,
+ highest_zoneidx);
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
@@ -7222,7 +7224,7 @@ restart:
/*
* There should be no need to raise the scanning priority if
- * enough pages are already being scanned that that high
+ * enough pages are already being scanned that the high
* watermark would be met at 100% efficiency.
*/
if (kswapd_shrink_node(pgdat, &sc))
@@ -7314,6 +7316,9 @@ out:
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
+ trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order,
+ highest_zoneidx, sc.nr_reclaimed);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 49f96ee0c40f6..ffe843ca219c7 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -275,7 +275,7 @@ static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
desc.count = 1; /* give more than one skb per call */
tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
- rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+ rdsdebug("tcp_read_sock for tc %p gfp %pGg returned %d\n", tc, &gfp,
desc.error);
if (skb_queue_empty_lockless(&sock->sk->sk_receive_queue) &&
diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 42f54936f4bbd..f1ec7de58ae34 100644
--- a/tools/testing/selftests/cgroup/lib/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -141,7 +141,7 @@ int cg_read_strcmp_wait(const char *cgroup, const char *control,
int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
if (cg_read(cgroup, control, buf, sizeof(buf)))
return -1;
@@ -171,7 +171,7 @@ long cg_read_long_fd(int fd)
long cg_read_key_long(const char *cgroup, const char *control, const char *key)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
char *ptr;
if (cg_read(cgroup, control, buf, sizeof(buf)))
@@ -207,7 +207,7 @@ long cg_read_key_long_poll(const char *cgroup, const char *control,
long cg_read_lc(const char *cgroup, const char *control)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
const char delim[] = "\n";
char *line;
long cnt = 0;
@@ -259,7 +259,7 @@ int cg_write_numeric(const char *cgroup, const char *control, long value)
static int cg_find_root(char *root, size_t len, const char *controller,
bool *nsdelegate)
{
- char buf[10 * PAGE_SIZE];
+ char buf[10 * BUF_SIZE];
char *fs, *mount, *type, *options;
const char delim[] = "\n\t ";
@@ -314,7 +314,7 @@ int cg_create(const char *cgroup)
int cg_wait_for_proc_count(const char *cgroup, int count)
{
- char buf[10 * PAGE_SIZE] = {0};
+ char buf[10 * BUF_SIZE] = {0};
int attempts;
char *ptr;
@@ -339,7 +339,7 @@ int cg_wait_for_proc_count(const char *cgroup, int count)
int cg_killall(const char *cgroup)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
char *ptr = buf;
/* If cgroup.kill exists use it. */
@@ -549,7 +549,7 @@ int cg_run_nowait(const char *cgroup,
int proc_mount_contains(const char *option)
{
- char buf[4 * PAGE_SIZE];
+ char buf[4 * BUF_SIZE];
ssize_t read;
read = read_text("/proc/mounts", buf, sizeof(buf));
@@ -561,7 +561,7 @@ int proc_mount_contains(const char *option)
int cgroup_feature(const char *feature)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
ssize_t read;
read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf));
@@ -588,7 +588,7 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t
int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
return -1;
diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index 567b1082974c5..febc1723d0903 100644
--- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -2,8 +2,8 @@
#include <stdbool.h>
#include <stdlib.h>
-#ifndef PAGE_SIZE
-#define PAGE_SIZE 4096
+#ifndef BUF_SIZE
+#define BUF_SIZE 4096
#endif
#define MB(x) (x << 20)
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 7b83c7e7c9d4f..88ca832d4fc13 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -87,7 +87,7 @@ static int test_cgcore_destroy(const char *root)
int ret = KSFT_FAIL;
char *cg_test = NULL;
int child_pid;
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
cg_test = cg_name(root, "cg_test");
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
index 97fae92c8387e..160a9e6ad277d 100644
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -642,7 +642,7 @@ cleanup:
*/
static int proc_check_stopped(int pid)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
int len;
len = proc_read_text(pid, 0, "stat", buf, sizeof(buf));
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index 12f59925500bd..1db0ba1226b9b 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -24,7 +24,7 @@
* the maximum discrepancy between charge and vmstat entries is number
* of cpus multiplied by 64 pages.
*/
-#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
+#define MAX_VMSTAT_ERROR (sysconf(_SC_PAGESIZE) * 64 * get_nprocs())
#define KMEM_DEAD_WAIT_RETRIES 80
@@ -353,7 +353,7 @@ static int test_percpu_basic(const char *root)
{
int ret = KSFT_FAIL;
char *parent, *child;
- long current, percpu;
+ long current, percpu, slab;
int i;
parent = cg_name(root, "percpu_basic_test");
@@ -383,13 +383,14 @@ static int test_percpu_basic(const char *root)
current = cg_read_long(parent, "memory.current");
percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
+ slab = cg_read_key_long(parent, "memory.stat", "slab ");
- if (current > 0 && percpu > 0 && labs(current - percpu) <
- MAX_VMSTAT_ERROR)
+ if (current > 0 && percpu > 0 && slab >= 0 &&
+ labs(current - (percpu + slab)) < MAX_VMSTAT_ERROR)
ret = KSFT_PASS;
else
- printf("memory.current %ld\npercpu %ld\n",
- current, percpu);
+ printf("memory.current %ld\npercpu %ld\nslab %ld\ndelta %ld\n",
+ current, percpu, slab, current - (percpu + slab));
cleanup_children:
for (i = 0; i < 1000; i++) {
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index b43da9bc20c49..44338dbaee819 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -26,6 +26,7 @@
static bool has_localevents;
static bool has_recursiveprot;
+static int page_size;
int get_temp_fd(void)
{
@@ -34,7 +35,7 @@ int get_temp_fd(void)
int alloc_pagecache(int fd, size_t size)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
struct stat st;
int i;
@@ -61,7 +62,7 @@ int alloc_anon(const char *cgroup, void *arg)
char *buf, *ptr;
buf = malloc(size);
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ for (ptr = buf; ptr < buf + size; ptr += page_size)
*ptr = 0;
free(buf);
@@ -70,7 +71,7 @@ int alloc_anon(const char *cgroup, void *arg)
int is_swap_enabled(void)
{
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
const char delim[] = "\n";
int cnt = 0;
char *line;
@@ -113,7 +114,7 @@ static int test_memcg_subtree_control(const char *root)
{
char *parent, *child, *parent2 = NULL, *child2 = NULL;
int ret = KSFT_FAIL;
- char buf[PAGE_SIZE];
+ char buf[BUF_SIZE];
/* Create two nested cgroups with the memory controller enabled */
parent = cg_name(root, "memcg_test_0");
@@ -184,7 +185,7 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg)
return -1;
}
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ for (ptr = buf; ptr < buf + size; ptr += page_size)
*ptr = 0;
current = cg_read_long(cgroup, "memory.current");
@@ -414,7 +415,7 @@ static int alloc_anon_noexit(const char *cgroup, void *arg)
return -1;
}
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ for (ptr = buf; ptr < buf + size; ptr += page_size)
*ptr = 0;
while (getppid() == ppid)
@@ -1000,7 +1001,7 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
return -1;
}
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ for (ptr = buf; ptr < buf + size; ptr += page_size)
*ptr = 0;
mem_current = cg_read_long(cgroup, "memory.current");
@@ -1791,6 +1792,10 @@ int main(int argc, char **argv)
char root[PATH_MAX];
int i, proc_status;
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if (page_size <= 0)
+ page_size = BUF_SIZE;
+
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index a7bdcdd09d627..49b36ee791606 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -11,10 +11,16 @@
#include <string.h>
#include <sys/wait.h>
#include <sys/mman.h>
+#include <sys/random.h>
#include "kselftest.h"
#include "cgroup_util.h"
+static int page_size;
+
+#define PATH_ZSWAP "/sys/module/zswap"
+#define PATH_ZSWAP_ENABLED "/sys/module/zswap/parameters/enabled"
+
static int read_int(const char *path, size_t *value)
{
FILE *file;
@@ -70,11 +76,11 @@ static int allocate_and_read_bytes(const char *cgroup, void *arg)
if (!mem)
return -1;
- for (int i = 0; i < size; i += 4095)
+ for (int i = 0; i < size; i += page_size)
mem[i] = 'a';
/* Go through the allocated memory to (z)swap in and out pages */
- for (int i = 0; i < size; i += 4095) {
+ for (int i = 0; i < size; i += page_size) {
if (mem[i] != 'a')
ret = -1;
}
@@ -90,7 +96,7 @@ static int allocate_bytes(const char *cgroup, void *arg)
if (!mem)
return -1;
- for (int i = 0; i < size; i += 4095)
+ for (int i = 0; i < size; i += page_size)
mem[i] = 'a';
free(mem);
return 0;
@@ -115,6 +121,27 @@ fail:
}
/*
+ * Writeback is asynchronous; poll until at least one writeback has
+ * been recorded for @cg, or until @timeout_ms has elapsed.
+ */
+static long wait_for_writeback(const char *cg, int timeout_ms)
+{
+ long elapsed, count;
+ for (elapsed = 0; elapsed < timeout_ms; elapsed += 100) {
+ count = get_cg_wb_count(cg);
+
+ if (count < 0)
+ return -1;
+ if (count > 0)
+ return count;
+
+ usleep(100000);
+ }
+
+ return 0;
+}
+
+/*
* Sanity test to check that pages are written into zswap.
*/
static int test_zswap_usage(const char *root)
@@ -162,21 +189,25 @@ out:
static int test_swapin_nozswap(const char *root)
{
int ret = KSFT_FAIL;
- char *test_group;
- long swap_peak, zswpout;
+ char *test_group, mem_max_buf[32];
+ long swap_peak, zswpout, min_swap;
+ size_t allocation_size = page_size * 512;
+
+ min_swap = allocation_size / 4;
+ snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size * 3/4);
test_group = cg_name(root, "no_zswap_test");
if (!test_group)
goto out;
if (cg_create(test_group))
goto out;
- if (cg_write(test_group, "memory.max", "8M"))
+ if (cg_write(test_group, "memory.max", mem_max_buf))
goto out;
if (cg_write(test_group, "memory.zswap.max", "0"))
goto out;
/* Allocate and read more than memory.max to trigger swapin */
- if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+ if (cg_run(test_group, allocate_and_read_bytes, (void *)allocation_size))
goto out;
/* Verify that pages are swapped out, but no zswap happened */
@@ -186,8 +217,9 @@ static int test_swapin_nozswap(const char *root)
goto out;
}
- if (swap_peak < MB(24)) {
- ksft_print_msg("at least 24MB of memory should be swapped out\n");
+ if (swap_peak < min_swap) {
+ ksft_print_msg("at least %ldKB of memory should be swapped out\n",
+ min_swap / 1024);
goto out;
}
@@ -237,7 +269,7 @@ static int test_zswapin(const char *root)
goto out;
}
- if (zswpin < MB(24) / PAGE_SIZE) {
+ if (zswpin < MB(24) / page_size) {
ksft_print_msg("at least 24MB should be brought back from zswap\n");
goto out;
}
@@ -257,16 +289,15 @@ out:
This will move it into zswap.
* 3. Save current zswap usage.
* 4. Move the memory allocated in step 1 back in from zswap.
- * 5. Set zswap.max to half the amount that was recorded in step 3.
+ * 5. Set zswap.max to 1/4 of the amount that was recorded in step 3.
* 6. Attempt to reclaim memory equal to the amount that was allocated,
this will either trigger writeback if it's enabled, or reclamation
will fail if writeback is disabled as there isn't enough zswap space.
*/
static int attempt_writeback(const char *cgroup, void *arg)
{
- long pagesize = sysconf(_SC_PAGESIZE);
- size_t memsize = MB(4);
- char buf[pagesize];
+ size_t memsize = page_size * 1024;
+ char buf[page_size];
long zswap_usage;
bool wb_enabled = *(bool *) arg;
int ret = -1;
@@ -281,11 +312,11 @@ static int attempt_writeback(const char *cgroup, void *arg)
* half empty, this will result in data that is still compressible
* and ends up in zswap, with material zswap usage.
*/
- for (int i = 0; i < pagesize; i++)
- buf[i] = i < pagesize/2 ? (char) i : 0;
+ for (int i = 0; i < page_size; i++)
+ buf[i] = i < page_size/2 ? (char) i : 0;
- for (int i = 0; i < memsize; i += pagesize)
- memcpy(&mem[i], buf, pagesize);
+ for (int i = 0; i < memsize; i += page_size)
+ memcpy(&mem[i], buf, page_size);
/* Try and reclaim allocated memory */
if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
@@ -296,19 +327,19 @@ static int attempt_writeback(const char *cgroup, void *arg)
zswap_usage = cg_read_long(cgroup, "memory.zswap.current");
/* zswpin */
- for (int i = 0; i < memsize; i += pagesize) {
- if (memcmp(&mem[i], buf, pagesize)) {
+ for (int i = 0; i < memsize; i += page_size) {
+ if (memcmp(&mem[i], buf, page_size)) {
ksft_print_msg("invalid memory\n");
goto out;
}
}
- if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2))
+ if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/4))
goto out;
/*
* If writeback is enabled, trying to reclaim memory now will trigger a
- * writeback as zswap.max is half of what was needed when reclaim ran the first time.
+ * writeback as zswap.max is 1/4 of what was needed when reclaim ran the first time.
* If writeback is disabled, memory reclaim will fail as zswap is limited and
* it can't writeback to swap.
*/
@@ -335,7 +366,10 @@ static int test_zswap_writeback_one(const char *cgroup, bool wb)
return -1;
/* Verify that zswap writeback occurred only if writeback was enabled */
- zswpwb_after = get_cg_wb_count(cgroup);
+ if (wb)
+ zswpwb_after = wait_for_writeback(cgroup, 5000);
+ else
+ zswpwb_after = get_cg_wb_count(cgroup);
if (zswpwb_after < 0)
return -1;
@@ -417,44 +451,71 @@ static int test_zswap_writeback_disabled(const char *root)
static int test_no_invasive_cgroup_shrink(const char *root)
{
int ret = KSFT_FAIL;
- size_t control_allocation_size = MB(10);
- char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL;
+ unsigned int off;
+ size_t allocation_size = page_size * 1024;
+ unsigned int nr_pages = allocation_size / page_size;
+ char zswap_max_buf[32], mem_max_buf[32];
+ char *zw_allocation = NULL, *wb_allocation = NULL;
+ char *zw_group = NULL, *wb_group = NULL;
+
+ snprintf(zswap_max_buf, sizeof(zswap_max_buf), "%d", page_size);
+ snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size / 2);
wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
if (!wb_group)
return KSFT_FAIL;
- if (cg_write(wb_group, "memory.zswap.max", "10K"))
+ if (cg_write(wb_group, "memory.zswap.max", zswap_max_buf))
+ goto out;
+ if (cg_write(wb_group, "memory.max", mem_max_buf))
+ goto out;
+
+ zw_group = setup_test_group_1M(root, "per_memcg_wb_test2");
+ if (!zw_group)
goto out;
- control_group = setup_test_group_1M(root, "per_memcg_wb_test2");
- if (!control_group)
+ if (cg_write(zw_group, "memory.max", mem_max_buf))
goto out;
- /* Push some test_group2 memory into zswap */
- if (cg_enter_current(control_group))
+ /* Push some zw_group memory into zswap (simple data, easy to compress) */
+ if (cg_enter_current(zw_group))
goto out;
- control_allocation = malloc(control_allocation_size);
- for (int i = 0; i < control_allocation_size; i += 4095)
- control_allocation[i] = 'a';
- if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1)
+ zw_allocation = malloc(allocation_size);
+ for (int i = 0; i < nr_pages; i++) {
+ off = (unsigned long)i * page_size;
+ memset(&zw_allocation[off], 0, page_size);
+ memset(&zw_allocation[off], 'a', page_size/4);
+ }
+ if (cg_read_key_long(zw_group, "memory.stat", "zswapped") < 1)
goto out;
- /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */
- if (cg_run(wb_group, allocate_bytes, (void *)MB(10)))
+ /* Push wb_group memory into zswap with hard-to-compress data to trigger wb */
+ if (cg_enter_current(wb_group))
goto out;
+ wb_allocation = malloc(allocation_size);
+ if (!wb_allocation)
+ goto out;
+ for (int i = 0; i < nr_pages; i++) {
+ off = (unsigned long)i * page_size;
+ memset(&wb_allocation[off], 0, page_size);
+ getrandom(&wb_allocation[off], page_size/4, 0);
+ }
/* Verify that only zswapped memory from gwb_group has been written back */
- if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0)
+ if (wait_for_writeback(wb_group, 5000) > 0 && get_cg_wb_count(zw_group) == 0)
ret = KSFT_PASS;
out:
cg_enter_current(root);
- if (control_group) {
- cg_destroy(control_group);
- free(control_group);
+ if (zw_group) {
+ cg_destroy(zw_group);
+ free(zw_group);
}
- cg_destroy(wb_group);
- free(wb_group);
- if (control_allocation)
- free(control_allocation);
+ if (wb_group) {
+ cg_destroy(wb_group);
+ free(wb_group);
+ }
+ if (zw_allocation)
+ free(zw_allocation);
+ if (wb_allocation)
+ free(wb_allocation);
return ret;
}
@@ -473,7 +534,7 @@ static int no_kmem_bypass_child(const char *cgroup, void *arg)
values->child_allocated = true;
return -1;
}
- for (long i = 0; i < values->target_alloc_bytes; i += 4095)
+ for (long i = 0; i < values->target_alloc_bytes; i += page_size)
((char *)allocation)[i] = 'a';
values->child_allocated = true;
pause();
@@ -521,7 +582,7 @@ static int test_no_kmem_bypass(const char *root)
min_free_kb_low = sys_info.totalram / 500000;
values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) +
sys_info.totalram * 5 / 100;
- stored_pages_threshold = sys_info.totalram / 5 / 4096;
+ stored_pages_threshold = sys_info.totalram / 5 / page_size;
trigger_allocation_size = sys_info.totalram / 20;
/* Set up test memcg */
@@ -548,7 +609,7 @@ static int test_no_kmem_bypass(const char *root)
if (!trigger_allocation)
break;
- for (int i = 0; i < trigger_allocation_size; i += 4095)
+ for (int i = 0; i < trigger_allocation_size; i += page_size)
trigger_allocation[i] = 'b';
usleep(100000);
free(trigger_allocation);
@@ -559,8 +620,8 @@ static int test_no_kmem_bypass(const char *root)
/* If memory was pushed to zswap, verify it belongs to memcg */
if (stored_pages > stored_pages_threshold) {
int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped ");
- int delta = stored_pages * 4096 - zswapped;
- int result_ok = delta < stored_pages * 4096 / 4;
+ int delta = stored_pages * page_size - zswapped;
+ int result_ok = delta < stored_pages * page_size / 4;
ret = result_ok ? KSFT_PASS : KSFT_FAIL;
break;
@@ -614,7 +675,7 @@ static int allocate_random_and_wait(const char *cgroup, void *arg)
close(fd);
/* Touch all pages to ensure they're faulted in */
- for (size_t i = 0; i < size; i += PAGE_SIZE)
+ for (size_t i = 0; i < size; i += page_size)
mem[i] = mem[i];
/* Use MADV_PAGEOUT to push pages into zswap */
@@ -725,9 +786,18 @@ struct zswap_test {
};
#undef T
-static bool zswap_configured(void)
+static void check_zswap_enabled(void)
{
- return access("/sys/module/zswap", F_OK) == 0;
+ char value[2];
+
+ if (access(PATH_ZSWAP, F_OK))
+ ksft_exit_skip("zswap isn't configured\n");
+
+ if (read_text(PATH_ZSWAP_ENABLED, value, sizeof(value)) <= 0)
+ ksft_exit_fail_msg("Failed to read " PATH_ZSWAP_ENABLED "\n");
+
+ if (value[0] == 'N')
+ ksft_exit_skip("zswap is disabled (hint: echo 1 > " PATH_ZSWAP_ENABLED ")\n");
}
int main(int argc, char **argv)
@@ -735,13 +805,16 @@ int main(int argc, char **argv)
char root[PATH_MAX];
int i;
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if (page_size <= 0)
+ page_size = BUF_SIZE;
+
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
- if (!zswap_configured())
- ksft_exit_skip("zswap isn't configured\n");
+ check_zswap_enabled();
/*
* Check that memory controller is available:
diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py
index 2b4df655d9fd0..8b12cc0484405 100644
--- a/tools/testing/selftests/damon/_damon_sysfs.py
+++ b/tools/testing/selftests/damon/_damon_sysfs.py
@@ -132,14 +132,17 @@ class DamosQuota:
goals = None # quota goals
goal_tuner = None # quota goal tuner
reset_interval_ms = None # quota reset interval
+ fail_charge_num = None
+ fail_charge_denom = None
weight_sz_permil = None
weight_nr_accesses_permil = None
weight_age_permil = None
scheme = None # owner scheme
def __init__(self, sz=0, ms=0, goals=None, goal_tuner='consist',
- reset_interval_ms=0, weight_sz_permil=0,
- weight_nr_accesses_permil=0, weight_age_permil=0):
+ reset_interval_ms=0, fail_charge_num=0, fail_charge_denom=0,
+ weight_sz_permil=0, weight_nr_accesses_permil=0,
+ weight_age_permil=0):
self.sz = sz
self.ms = ms
self.reset_interval_ms = reset_interval_ms
@@ -151,6 +154,8 @@ class DamosQuota:
for idx, goal in enumerate(self.goals):
goal.idx = idx
goal.quota = self
+ self.fail_charge_num = fail_charge_num
+ self.fail_charge_denom = fail_charge_denom
def sysfs_dir(self):
return os.path.join(self.scheme.sysfs_dir(), 'quotas')
@@ -197,6 +202,18 @@ class DamosQuota:
os.path.join(self.sysfs_dir(), 'goal_tuner'), self.goal_tuner)
if err is not None:
return err
+
+ err = write_file(
+ os.path.join(self.sysfs_dir(), 'fail_charge_num'),
+ self.fail_charge_num)
+ if err is not None:
+ return err
+ err = write_file(
+ os.path.join(self.sysfs_dir(), 'fail_charge_denom'),
+ self.fail_charge_denom)
+ if err is not None:
+ return err
+
return None
class DamosWatermarks:
@@ -604,10 +621,11 @@ class DamonCtx:
targets = None
schemes = None
kdamond = None
+ pause = None
idx = None
def __init__(self, ops='paddr', monitoring_attrs=DamonAttrs(), targets=[],
- schemes=[]):
+ schemes=[], pause=False):
self.ops = ops
self.monitoring_attrs = monitoring_attrs
self.monitoring_attrs.context = self
@@ -622,6 +640,8 @@ class DamonCtx:
scheme.idx = idx
scheme.context = self
+ self.pause=pause
+
def sysfs_dir(self):
return os.path.join(self.kdamond.sysfs_dir(), 'contexts',
'%d' % self.idx)
@@ -662,6 +682,11 @@ class DamonCtx:
err = scheme.stage()
if err is not None:
return err
+
+ err = write_file(os.path.join(self.sysfs_dir(), 'pause'), self.pause)
+ if err is not None:
+ return err
+
return None
class Kdamond:
diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py
index af99b07a4f565..972948e6215f1 100755
--- a/tools/testing/selftests/damon/drgn_dump_damon_status.py
+++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py
@@ -112,6 +112,8 @@ def damos_quota_to_dict(quota):
['goals', damos_quota_goals_to_list],
['goal_tuner', int],
['esz', int],
+ ['fail_charge_num', int],
+ ['fail_charge_denom', int],
['weight_sz', int],
['weight_nr_accesses', int],
['weight_age', int],
@@ -200,6 +202,7 @@ def damon_ctx_to_dict(ctx):
['attrs', attrs_to_dict],
['adaptive_targets', targets_to_list],
['schemes', schemes_to_list],
+ ['pause', bool],
])
def main():
diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py
index 3aa5c91548a53..cd4d82c852113 100755
--- a/tools/testing/selftests/damon/sysfs.py
+++ b/tools/testing/selftests/damon/sysfs.py
@@ -73,6 +73,10 @@ def assert_quota_committed(quota, dump):
}
assert_true(dump['goal_tuner'] == tuner_val[quota.goal_tuner],
'goal_tuner', dump)
+ assert_true(dump['fail_charge_num'] == quota.fail_charge_num,
+ 'fail_charge_num', dump)
+ assert_true(dump['fail_charge_denom'] == quota.fail_charge_denom,
+ 'fail_charge_denom', dump)
assert_true(dump['weight_sz'] == quota.weight_sz_permil, 'weight_sz', dump)
assert_true(dump['weight_nr_accesses'] == quota.weight_nr_accesses_permil,
'weight_nr_accesses', dump)
@@ -123,11 +127,12 @@ def assert_scheme_committed(scheme, dump):
'pageout': 2,
'hugepage': 3,
'nohugeapge': 4,
- 'lru_prio': 5,
- 'lru_deprio': 6,
- 'migrate_hot': 7,
- 'migrate_cold': 8,
- 'stat': 9,
+ 'collapse': 5,
+ 'lru_prio': 6,
+ 'lru_deprio': 7,
+ 'migrate_hot': 8,
+ 'migrate_cold': 9,
+ 'stat': 10,
}
assert_true(dump['action'] == action_val[scheme.action], 'action', dump)
assert_true(dump['apply_interval_us'] == scheme. apply_interval_us,
@@ -190,20 +195,58 @@ def assert_ctx_committed(ctx, dump):
assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs'])
assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets'])
assert_schemes_committed(ctx.schemes, dump['schemes'])
+ assert_true(dump['pause'] == ctx.pause, 'pause', dump)
def assert_ctxs_committed(kdamonds):
+ ctxs_paused_for_dump = []
+ kdamonds_paused_for_dump = []
+ # pause for safe state dumping
+ for kd in kdamonds.kdamonds:
+ for ctx in kd.contexts:
+ if ctx.pause is False:
+ ctx.pause = True
+ ctxs_paused_for_dump.append(ctx)
+ if not kd in kdamonds_paused_for_dump:
+ kdamonds_paused_for_dump.append(kd)
+ if kd in kdamonds_paused_for_dump:
+ err = kd.commit()
+ if err is not None:
+ print('pause fail (%s)' % err)
+ kdamonds.stop()
+ exit(1)
+
status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid)
if err is not None:
print(err)
kdamonds.stop()
exit(1)
+ # resume contexts paused for safe state dumping
+ for ctx in ctxs_paused_for_dump:
+ ctx.pause = False
+ for kd in kdamonds_paused_for_dump:
+ err = kd.commit()
+ if err is not None:
+ print('resume fail (%s)' % err)
+ kdamonds.stop()
+ exit(1)
+
+ # restore for comparison
+ for ctx in ctxs_paused_for_dump:
+ ctx.pause = True
+
ctxs = kdamonds.kdamonds[0].contexts
dump = status['contexts']
assert_true(len(ctxs) == len(dump), 'ctxs length', dump)
for idx, ctx in enumerate(ctxs):
assert_ctx_committed(ctx, dump[idx])
+ # restore for the caller
+ for kd in kdamonds.kdamonds:
+ for ctx in kd.contexts:
+ if ctx in ctxs_paused_for_dump:
+ ctx.pause = False
+
def main():
kdamonds = _damon_sysfs.Kdamonds(
[_damon_sysfs.Kdamond(
@@ -239,6 +282,8 @@ def main():
nid=1)],
goal_tuner='temporal',
reset_interval_ms=1500,
+ fail_charge_num=1,
+ fail_charge_denom=4096,
weight_sz_permil=20,
weight_nr_accesses_permil=200,
weight_age_permil=1000),
@@ -301,6 +346,7 @@ def main():
print('kdamond start failed: %s' % err)
exit(1)
kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True
+ kdamonds.kdamonds[0].contexts[0].pause = True
kdamonds.kdamonds[0].commit()
del kdamonds.kdamonds[0].contexts[0].targets[1]
assert_ctxs_committed(kdamonds)
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index cd24596cdd27e..41053fdaad88d 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -151,6 +151,7 @@ TEST_PROGS += ksft_gup_test.sh
TEST_PROGS += ksft_hmm.sh
TEST_PROGS += ksft_hugetlb.sh
TEST_PROGS += ksft_hugevm.sh
+TEST_PROGS += ksft_kmemleak_dedup.sh
TEST_PROGS += ksft_ksm.sh
TEST_PROGS += ksft_ksm_numa.sh
TEST_PROGS += ksft_madv_guard.sh
@@ -216,7 +217,8 @@ ifeq ($(CAN_BUILD_I386),1)
$(BINARIES_32): CFLAGS += -m32 -mxsave
$(BINARIES_32): LDLIBS += -lrt -ldl -lm
$(BINARIES_32): $(OUTPUT)/%_32: %.c
- $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+ $(call msg,CC,,$@)
+ $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
endif
@@ -224,7 +226,8 @@ ifeq ($(CAN_BUILD_X86_64),1)
$(BINARIES_64): CFLAGS += -m64 -mxsave
$(BINARIES_64): LDLIBS += -lrt -ldl
$(BINARIES_64): $(OUTPUT)/%_64: %.c
- $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+ $(call msg,CC,,$@)
+ $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
endif
@@ -261,7 +264,8 @@ $(OUTPUT)/migration: LDLIBS += -lnuma
$(OUTPUT)/rmap: LDLIBS += -lnuma
local_config.mk local_config.h: check_config.sh
- CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh
+ $(call msg,CHK,config,$@)
+ $(Q)CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh
EXTRA_CLEAN += local_config.mk local_config.h
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
index b84c82bbf8752..32beaefe279e5 100755
--- a/tools/testing/selftests/mm/check_config.sh
+++ b/tools/testing/selftests/mm/check_config.sh
@@ -16,7 +16,7 @@ echo "#include <sys/types.h>" > $tmpfile_c
echo "#include <liburing.h>" >> $tmpfile_c
echo "int func(void) { return 0; }" >> $tmpfile_c
-$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o
+$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
if [ -f $tmpfile_o ]; then
echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c
index 44940f75c461d..30c8be37fcb9d 100644
--- a/tools/testing/selftests/mm/droppable.c
+++ b/tools/testing/selftests/mm/droppable.c
@@ -26,7 +26,14 @@ int main(int argc, char *argv[])
ksft_set_plan(1);
alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
- assert(alloc != MAP_FAILED);
+ if (alloc == MAP_FAILED) {
+ if ((errno == EOPNOTSUPP) || (errno == EINVAL)) {
+ ksft_test_result_skip("MAP_DROPPABLE not supported\n");
+ exit(KSFT_SKIP);
+ }
+ ksft_test_result_fail("mmap error: %s\n", strerror(errno));
+ exit(KSFT_FAIL);
+ }
memset(alloc, 'A', alloc_size);
for (size_t i = 0; i < alloc_size; i += page_size)
assert(*(uint8_t *)(alloc + i));
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
index 3fe7ef04ac62e..c8393ca52cab7 100644
--- a/tools/testing/selftests/mm/khugepaged.c
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -373,7 +373,7 @@ static void *file_setup_area(int nr_hpages)
unlink(finfo.path); /* Cleanup from previous failed tests */
printf("Creating %s for collapse%s...", finfo.path,
finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
- fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
+ fd = open(finfo.path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
777);
if (fd < 0) {
perror("open()");
@@ -381,9 +381,21 @@ static void *file_setup_area(int nr_hpages)
}
size = nr_hpages * hpage_pmd_size;
- p = alloc_mapping(nr_hpages);
+ if (ftruncate(fd, size)) {
+ perror("ftruncate()");
+ exit(EXIT_FAILURE);
+ }
+ p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (p != BASE_ADDR) {
+ perror("mmap()");
+ exit(EXIT_FAILURE);
+ }
fill_memory(p, 0, size);
- write(fd, p, size);
+ if (msync(p, size, MS_SYNC)) {
+ perror("msync()");
+ exit(EXIT_FAILURE);
+ }
close(fd);
munmap(p, size);
success("OK");
diff --git a/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh
new file mode 100755
index 0000000000000..d019502444901
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh
@@ -0,0 +1,222 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Regression test for kmemleak's per-scan verbose dedup.
+#
+# Loads samples/kmemleak's helper module to generate orphan allocations
+# (some of which share an allocation backtrace), runs a few kmemleak
+# scans with verbose printing enabled, and verifies that no two
+# "unreferenced object" reports within a single scan share the same
+# backtrace - which would mean dedup failed to collapse them.
+#
+# This test is intentionally permissive: the kmemleak-test module's
+# leaks frequently get reported across many separate scans (per-CPU
+# chunk reuse, slab freelist pointers, kernel stack residue), so dedup
+# may never have anything to fold within one scan. That is not a
+# regression. The test only fails when it actually catches dedup not
+# happening on input that should have triggered it - i.e. two reports
+# with identical backtraces in the same scan.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+ksft_skip=4
+KMEMLEAK=/sys/kernel/debug/kmemleak
+VERBOSE_PARAM=/sys/module/kmemleak/parameters/verbose
+MODULE=kmemleak-test
+
+skip() {
+ echo "SKIP: $*"
+ exit $ksft_skip
+}
+
+fail() {
+ echo "FAIL: $*"
+ exit 1
+}
+
+pass() {
+ echo "PASS: $*"
+ exit 0
+}
+
+[ "$(id -u)" -eq 0 ] || skip "must run as root"
+[ -r "$KMEMLEAK" ] || skip "no kmemleak debugfs (CONFIG_DEBUG_KMEMLEAK)"
+[ -w "$VERBOSE_PARAM" ] || skip "kmemleak verbose param missing"
+modinfo "$MODULE" >/dev/null 2>&1 ||
+ skip "$MODULE not built (CONFIG_SAMPLE_KMEMLEAK)"
+
+# The verdict depends entirely on dmesg contents, so a silently-empty
+# dmesg (dmesg_restrict=1 with CAP_SYSLOG dropped, restricted container,
+# etc.) would let the script report PASS without parsing anything. Probe
+# both read and clear up front and skip cleanly if either is denied.
+dmesg >/dev/null 2>&1 ||
+ skip "cannot read dmesg (need CAP_SYSLOG or dmesg_restrict=0)"
+dmesg -C >/dev/null 2>&1 ||
+ skip "cannot clear dmesg (need CAP_SYSLOG or dmesg_restrict=0)"
+
+# kmemleak can be present but disabled at runtime (boot arg kmemleak=off,
+# or it self-disabled after an internal error). In that state writes other
+# than "clear" return EPERM, so probe once and skip if so.
+if ! echo scan > "$KMEMLEAK" 2>/dev/null; then
+ skip "kmemleak is disabled (check dmesg or kmemleak= boot arg)"
+fi
+
+prev_verbose=$(cat "$VERBOSE_PARAM")
+# shellcheck disable=SC2317 # invoked indirectly via trap
+cleanup() {
+ echo "$prev_verbose" > "$VERBOSE_PARAM" 2>/dev/null
+ rmmod "$MODULE" 2>/dev/null
+ # Drain the leak set we generated. Subsequent selftests (e.g.
+ # tools/testing/selftests/net/netfilter/nft_interface_stress.sh)
+ # fail on any non-empty kmemleak report, so leaving the helper
+ # module's intentional leaks behind would poison the rest of a
+ # kselftest run.
+ #
+ # Caveat: kmemleak_clear() only greys objects that have already
+ # been reported (OBJECT_REPORTED && unreferenced_object()). Helper
+ # allocations that stayed "still referenced" throughout the test
+ # (stale pointers in per-CPU chunks, slab freelists, kernel stacks)
+ # were never reported and are therefore not greyed by this clear -
+ # they remain tracked and a later scan can still surface them. Such
+ # leftovers are inherent to the kmemleak-test sample module and are
+ # not specific to this test; consumers that fail on any kmemleak
+ # output (rather than on the test-specific backtraces) need to be
+ # robust to that, or this test should be excluded from the run.
+ echo clear > "$KMEMLEAK" 2>/dev/null
+}
+trap cleanup EXIT
+
+echo 1 > "$VERBOSE_PARAM"
+
+# Drain the existing leak set so the next scan only reports our objects.
+echo clear > "$KMEMLEAK"
+
+# Re-clear dmesg now (the up-front probe also cleared it, but anything
+# logged between then and here - module unload chatter, the probe scan,
+# the verbose-param write - would otherwise pollute the parse window).
+dmesg -C >/dev/null
+
+# If the module was left loaded by a previous aborted run, modprobe would
+# be a no-op and the init function would not run, so no new leaks would be
+# generated. Force a clean state first.
+rmmod "$MODULE" 2>/dev/null
+modprobe "$MODULE" || skip "failed to load $MODULE"
+# Removing the module orphans the list elements without freeing them.
+rmmod "$MODULE" || skip "failed to unload $MODULE"
+
+# Run a handful of scans so kmemleak has the chance to age and report
+# the orphans. We do not require any particular number to be reported:
+# the regression check below operates on whatever lands in dmesg.
+#
+# Note: with CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y the kernel's own scan
+# thread can report and mark these orphans (OBJECT_REPORTED) before our
+# manual scans run, after which our scans will see nothing. The
+# lower-bound check below catches the case where that happens and the
+# manual scans also produce nothing.
+SCAN_COUNT=4
+SCAN_SLEEP=6
+for _ in $(seq 1 "$SCAN_COUNT"); do
+ echo scan > "$KMEMLEAK"
+ sleep "$SCAN_SLEEP"
+done
+
+# Strip the leading "[ nnn.nnnnnn] " dmesg timestamp prefix. Without
+# this, two identical stack frames printed from two reports in the same
+# scan would produce different per-frame strings (different timestamps)
+# and the duplicate-backtrace check below would not match them, silently
+# passing a real dedup regression. Doing the strip here makes the rest
+# of the parser timestamp-agnostic regardless of what dmesg defaults to.
+log=$(dmesg | sed 's/^\[[^]]*\] //')
+
+# After running the workload (modprobe + scans), dmesg should contain at
+# least the helper module's pr_info lines and our manual-scan output. An
+# empty capture here means dmesg succeeded earlier but is now denying us
+# the buffer (race with dmesg_restrict toggling, etc.); refuse to give a
+# verdict on no evidence.
+[ -n "$log" ] || skip "dmesg returned empty after running workload"
+
+# Lower bound: if kmemleak's own per-scan tally counted leaks but the
+# verbose path emitted no "unreferenced object" line, the verbose printer
+# itself is regressed - fail rather than silently passing on no input.
+new_leaks=$(echo "$log" |
+ sed -n 's/.*kmemleak: \([0-9]\+\) new suspected.*/\1/p' |
+ awk '{s+=$1} END{print s+0}')
+printed=$(echo "$log" | grep -c 'kmemleak: unreferenced object')
+if [ "$new_leaks" -gt 0 ] && [ "$printed" -eq 0 ]; then
+ fail "verbose path broken: $new_leaks leaks counted, 0 printed in $SCAN_COUNT scans"
+fi
+
+# Walk the log: split into per-scan chunks at "N new suspected memory
+# leaks" boundaries; within each chunk, capture each "unreferenced
+# object" report's backtrace and check that no backtrace is reported
+# more than once. A duplicate within a single scan means dedup failed
+# to collapse two leaks that share an allocation site.
+violations=$(echo "$log" | awk '
+ function flush_block() {
+ if (in_block) {
+ # Skip empty backtraces: leaks with trace_handle == 0
+ # (early-boot allocations or stack_depot_save() failures
+ # under memory pressure) are intentionally not deduped,
+ # so multiple such reports in one scan are expected and
+ # must not be flagged as a regression.
+ if (bt != "")
+ seen[bt]++
+ in_block = 0
+ collecting = 0
+ bt = ""
+ }
+ }
+ function check_and_reset( b) {
+ for (b in seen)
+ if (seen[b] > 1)
+ printf("backtrace seen %d times in one scan:\n%s\n",
+ seen[b], b)
+ delete seen
+ }
+ # Scan boundary: the per-scan summary line.
+ /kmemleak: [0-9]+ new suspected memory leaks/ {
+ flush_block()
+ check_and_reset()
+ next
+ }
+ # Start of a new "unreferenced object" report.
+ /kmemleak: unreferenced object/ {
+ flush_block()
+ in_block = 1
+ next
+ }
+ # Inside a report, the "backtrace (crc ...):" line switches us to
+ # backtrace-collecting mode.
+ in_block && /kmemleak:[[:space:]]+backtrace \(crc/ {
+ collecting = 1
+ next
+ }
+ # Once collecting, capture only deeply-indented "kmemleak: " lines
+ # (stack frames have 4+ spaces of indentation under "kmemleak: ";
+ # headers and the "... and N more" tail line have less). This stops
+ # unrelated kmemleak warns landing between reports from being lumped
+ # into the backtrace key, which would mask a genuine duplicate.
+ in_block && collecting && /kmemleak:[[:space:]]{4,}/ {
+ bt = bt $0 "\n"
+ next
+ }
+ END {
+ flush_block()
+ check_and_reset()
+ }
+')
+
+if [ -n "$violations" ]; then
+ echo "$violations"
+ fail "kmemleak dedup regression: same backtrace reported more than once in a single scan"
+fi
+
+# Count the dedup summary lines so the report distinguishes "dedup
+# actually fired" from "no same-backtrace leaks turned up to dedup".
+dedup_lines=$(echo "$log" | grep -c 'more object(s) with the same backtrace')
+
+if [ "$dedup_lines" -gt 0 ]; then
+ pass "no dedup violations across $SCAN_COUNT scans; dedup fired ($dedup_lines summary line(s) observed)"
+else
+ pass "no dedup violations across $SCAN_COUNT scans; dedup had nothing to collapse"
+fi
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index b474f2b20def2..e16e288cc7c1f 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <sys/mman.h>
+#include <linux/mman.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
@@ -163,14 +164,17 @@ static int lock_check(unsigned long addr)
return (vma_rss == vma_size);
}
-static int unlock_lock_check(char *map)
+static int unlock_lock_check(char *map, bool mlock_supported)
{
- if (is_vmflag_set((unsigned long)map, LOCKED)) {
+ if (!is_vmflag_set((unsigned long)map, LOCKED))
+ return 0;
+
+ if (mlock_supported)
ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED);
- return 1;
- }
+ else
+ ksft_print_msg("VMA flag %s is present on an unsupported VMA\n", LOCKED);
- return 0;
+ return 1;
}
static void test_mlock_lock(void)
@@ -196,7 +200,7 @@ static void test_mlock_lock(void)
ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
- ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
+ ksft_test_result(!unlock_lock_check(map, true), "%s: Unlocked\n", __func__);
munmap(map, 2 * page_size);
}
@@ -296,7 +300,7 @@ static void test_munlockall0(void)
ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
}
- ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
+ ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__);
munmap(map, 2 * page_size);
}
@@ -336,7 +340,67 @@ static void test_munlockall1(void)
ksft_exit_fail_msg("munlockall() %s\n", strerror(errno));
}
- ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
+ ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__);
+ munmap(map, 2 * page_size);
+}
+
+/* Droppable memory should not be lockable. */
+static void test_mlock_droppable(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ /* Ensure MCL_FUTURE is not set. */
+ if (munlockall()) {
+ ksft_test_result_fail("munlockall() %s\n", strerror(errno));
+ return;
+ }
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+ if (map == MAP_FAILED) {
+ if ((errno == EOPNOTSUPP) || (errno == EINVAL))
+ ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__);
+ else
+ ksft_test_result_fail("mmap error: %s\n", strerror(errno));
+ return;
+ }
+
+ if (mlock2_(map, 2 * page_size, 0))
+ ksft_test_result_fail("mlock2(0): %s\n", strerror(errno));
+ else
+ ksft_test_result(!unlock_lock_check(map, false),
+ "%s: droppable memory not locked\n", __func__);
+
+ munmap(map, 2 * page_size);
+}
+
+static void test_mlockall_future_droppable(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ ksft_test_result_fail("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno));
+ return;
+ }
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+
+ if (map == MAP_FAILED) {
+ if ((errno == EOPNOTSUPP) || (errno == EINVAL))
+ ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__);
+ else
+ ksft_test_result_fail("mmap error: %s\n", strerror(errno));
+ munlockall();
+ return;
+ }
+
+ ksft_test_result(!unlock_lock_check(map, false), "%s: droppable memory not locked\n",
+ __func__);
+
+ munlockall();
munmap(map, 2 * page_size);
}
@@ -442,7 +506,7 @@ int main(int argc, char **argv)
munmap(map, size);
- ksft_set_plan(13);
+ ksft_set_plan(15);
test_mlock_lock();
test_mlock_onfault();
@@ -451,6 +515,8 @@ int main(int argc, char **argv)
test_lock_onfault_of_present();
test_vma_management(true);
test_mlockall();
+ test_mlock_droppable();
+ test_mlockall_future_droppable();
ksft_finished();
}
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 308576437228c..131d9d6db8679 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -76,27 +76,6 @@ enum {
.expect_failure = should_fail \
}
-/* compute square root using binary search */
-static unsigned long get_sqrt(unsigned long val)
-{
- unsigned long low = 1;
-
- /* assuming rand_size is less than 1TB */
- unsigned long high = (1UL << 20);
-
- while (low <= high) {
- unsigned long mid = low + (high - low) / 2;
- unsigned long temp = mid * mid;
-
- if (temp == val)
- return mid;
- if (temp < val)
- low = mid + 1;
- high = mid - 1;
- }
- return low;
-}
-
/*
* Returns false if the requested remap region overlaps with an
* existing mapping (e.g text, stack) else returns true.
@@ -995,11 +974,9 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
char *rand_addr)
{
void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL;
- unsigned long long t, d;
struct timespec t_start = {0, 0}, t_end = {0, 0};
long long start_ns, end_ns, align_mask, ret, offset;
unsigned long long threshold;
- unsigned long num_chunks;
if (threshold_mb == VALIDATION_NO_THRESHOLD)
threshold = c.region_size;
@@ -1068,87 +1045,21 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
goto clean_up_dest_preamble;
}
- /*
- * Verify byte pattern after remapping. Employ an algorithm with a
- * square root time complexity in threshold: divide the range into
- * chunks, if memcmp() returns non-zero, only then perform an
- * iteration in that chunk to find the mismatch index.
- */
- num_chunks = get_sqrt(threshold);
- for (unsigned long i = 0; i < num_chunks; ++i) {
- size_t chunk_size = threshold / num_chunks;
- unsigned long shift = i * chunk_size;
-
- if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size))
- continue;
-
- /* brute force iteration only over mismatch segment */
- for (t = shift; t < shift + chunk_size; ++t) {
- if (((char *) dest_addr)[t] != rand_addr[t]) {
- ksft_print_msg("Data after remap doesn't match at offset %llu\n",
- t);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
- ((char *) dest_addr)[t] & 0xff);
- ret = -1;
- goto clean_up_dest;
- }
- }
- }
-
- /*
- * if threshold is not divisible by num_chunks, then check the
- * last chunk
- */
- for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) {
- if (((char *) dest_addr)[t] != rand_addr[t]) {
- ksft_print_msg("Data after remap doesn't match at offset %llu\n",
- t);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
- ((char *) dest_addr)[t] & 0xff);
- ret = -1;
- goto clean_up_dest;
- }
+ /* Verify byte pattern after remapping */
+ if (memcmp(dest_addr, rand_addr, threshold)) {
+ ksft_print_msg("Data after remap doesn't match\n");
+ ret = -1;
+ goto clean_up_dest;
}
/* Verify the dest preamble byte pattern after remapping */
- if (!c.dest_preamble_size)
- goto no_preamble;
-
- num_chunks = get_sqrt(c.dest_preamble_size);
-
- for (unsigned long i = 0; i < num_chunks; ++i) {
- size_t chunk_size = c.dest_preamble_size / num_chunks;
- unsigned long shift = i * chunk_size;
-
- if (!memcmp(dest_preamble_addr + shift, rand_addr + shift,
- chunk_size))
- continue;
-
- /* brute force iteration only over mismatched segment */
- for (d = shift; d < shift + chunk_size; ++d) {
- if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
- ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
- d);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
- ((char *) dest_preamble_addr)[d] & 0xff);
- ret = -1;
- goto clean_up_dest;
- }
- }
- }
-
- for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) {
- if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
- ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
- d);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
- ((char *) dest_preamble_addr)[d] & 0xff);
- ret = -1;
- goto clean_up_dest;
- }
+ if (c.dest_preamble_size &&
+ memcmp(dest_preamble_addr, rand_addr, c.dest_preamble_size)) {
+ ksft_print_msg("Preamble data after remap doesn't match\n");
+ ret = -1;
+ goto clean_up_dest;
}
-no_preamble:
start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
ret = end_ns - start_ns;
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
index cd4610baf5d7d..3fffd5f7e6fb4 100644
--- a/tools/testing/selftests/mm/process_madv.c
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -310,6 +310,34 @@ TEST_F(process_madvise, invalid_vlen)
}
/*
+ * Test that invalid advice is rejected even when the iovec has zero total
+ * length. A request with valid advice and zero length is a noop, but
+ * invalid advice should still fail with EINVAL.
+ */
+TEST_F(process_madvise, invalid_advice_zero_length)
+{
+ struct iovec vec = {
+ .iov_base = NULL,
+ .iov_len = 0,
+ };
+ int pidfd = self->pidfd;
+ ssize_t ret;
+
+ errno = 0;
+ ret = sys_process_madvise(pidfd, &vec, 1, -1, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ errno = 0;
+ ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, 0);
+
+ ret = sys_process_madvise(pidfd, NULL, 0, -1, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+}
+
+/*
* Test process_madvise() with an invalid flag value. Currently, only a flag
* value of 0 is supported. This test is reserved for the future, e.g., if
* synchronous flags are added.
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index c17b133a81d24..3b61677fe9840 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -382,6 +382,7 @@ else
fi
CATEGORY="mmap" run_test ./map_populate
+CATEGORY="mmap" run_test ./droppable
CATEGORY="mlock" run_test ./mlock-random-test