aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
author: Mark Brown <broonie@kernel.org>, 2026-05-30 00:25:45 +0100
committer: Mark Brown <broonie@kernel.org>, 2026-05-30 00:25:46 +0100
commit: fe9618ab266d20638357eff97d84540aeb22d69b (patch)
tree: 4139a361d7bb521ee94831414b25a1567e4c11a8 /mm
parent: 99befc896988c8b8b3b948b19c9d1a4e40025c07 (diff)
parent: 1d8f40ed9011a5a660e952235a0e8db991de509a (diff)
download: linux-next-history-fe9618ab266d20638357eff97d84540aeb22d69b.tar.gz
Merge branch 'slab/for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 73
-rw-r--r-- mm/kasan/kasan_test_c.c 5
-rw-r--r-- mm/kfence/kfence_test.c 13
-rw-r--r-- mm/slab.h 4
-rw-r--r-- mm/slab_common.c 52
-rw-r--r-- mm/slub.c 655
6 files changed, 413 insertions, 389 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e649a950be93f..9e0ca48249054 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -248,22 +248,75 @@ config SLUB_STATS
out which slabs are relevant to a particular load.
Try running: slabinfo -DA
-config RANDOM_KMALLOC_CACHES
- default n
+config KMALLOC_PARTITION_CACHES
depends on !SLUB_TINY
- bool "Randomize slab caches for normal kmalloc"
+ bool "Partitioned slab caches for normal kmalloc"
+ default RANDOM_KMALLOC_CACHES
help
- A hardening feature that creates multiple copies of slab caches for
- normal kmalloc allocation and makes kmalloc randomly pick one based
- on code address, which makes the attackers more difficult to spray
- vulnerable memory objects on the heap for the purpose of exploiting
- memory vulnerabilities.
+ A hardening feature that creates multiple isolated copies of slab
+ caches for normal kmalloc allocations. This makes it more difficult
+ to exploit memory-safety vulnerabilities by attacking vulnerable
+ co-located memory objects. Several modes are provided.
Currently the number of copies is set to 16, a reasonably large value
that effectively diverges the memory objects allocated for different
subsystems or modules into different caches, at the expense of a
- limited degree of memory and CPU overhead that relates to hardware and
- system workload.
+ limited degree of memory and CPU overhead that relates to hardware
+ and system workload.
+
+choice
+ prompt "Partitioned slab cache mode"
+ depends on KMALLOC_PARTITION_CACHES
+ default KMALLOC_PARTITION_TYPED if CC_HAS_ALLOC_TOKEN
+ default KMALLOC_PARTITION_RANDOM
+ help
+ Selects the slab cache partitioning mode.
+
+config KMALLOC_PARTITION_RANDOM
+ bool "Randomize slab caches for normal kmalloc"
+ help
+ Randomly pick a slab cache based on code address and a per-boot
+ random seed.
+
+ This makes it harder for attackers to predict object co-location.
+ The placement is random: while attackers don't know which kmalloc
+ cache an object will be allocated from, they might circumvent
+ the randomization by retrying attacks across multiple machines until
+ the target objects are co-located.
+
+config KMALLOC_PARTITION_TYPED
+ bool "Type based slab cache selection for normal kmalloc"
+ depends on CC_HAS_ALLOC_TOKEN
+ help
+ Rely on Clang's allocation tokens to choose a slab cache, where token
+ IDs are derived from the allocated type.
+
+ Unlike KMALLOC_PARTITION_RANDOM, cache assignment is deterministic based
+ on type, which guarantees that objects of certain types are not
+ placed in the same cache. This effectively mitigates certain classes
+ of exploits that probabilistic defenses like KMALLOC_PARTITION_RANDOM
+ only make harder but not impossible. However, this also means the
+ cache assignment is predictable.
+
+ Clang's default token ID calculation returns a bounded hash with
+ disjoint ranges for pointer-containing and pointerless objects: when
+ used as the slab cache index, this prevents buffer overflows on
+ primitive buffers from directly corrupting pointer-containing
+ objects.
+
+ The current effectiveness of Clang's type inference can be judged by
+ -Rpass=alloc-token, which provides diagnostics where (after dead-code
+ elimination) type inference failed.
+
+ Requires Clang 22 or later.
+
+endchoice
+
+config RANDOM_KMALLOC_CACHES
+ bool
+ transitional
+ help
+ Transitional config for migration to KMALLOC_PARTITION_CACHES.
endmenu # Slab allocator options
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 3f4ed29178b3c..b9e167ed5be32 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1225,14 +1225,13 @@ static void kmem_cache_bulk(struct kunit *test)
struct kmem_cache *cache;
size_t size = 200;
char *p[10];
- bool ret;
int i;
cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
- ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p);
- if (!ret) {
+ if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p),
+ (void **)&p)) {
kunit_err(test, "Allocation failed: %s\n", __func__);
kmem_cache_destroy(cache);
return;
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 10424cd25e5a6..de2d0f7d62b15 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -214,7 +214,7 @@ static void test_cache_destroy(void)
static inline size_t kmalloc_cache_alignment(size_t size)
{
/* just to get ->align so no need to pass in the real caller */
- enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, 0);
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, __kmalloc_token(0));
return kmalloc_caches[type][__kmalloc_index(size, false)]->align;
}
@@ -285,7 +285,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
if (is_kfence_address(alloc)) {
struct slab *slab = virt_to_slab(alloc);
- enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, _RET_IP_);
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, __kmalloc_token(size));
struct kmem_cache *s = test_cache ?:
kmalloc_caches[type][__kmalloc_index(size, false)];
@@ -761,9 +761,10 @@ static void test_memcache_alloc_bulk(struct kunit *test)
timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
do {
void *objects[100];
- int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
- objects);
- if (!num)
+ int i;
+
+ if (!kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC,
+ ARRAY_SIZE(objects), objects))
continue;
for (i = 0; i < ARRAY_SIZE(objects); i++) {
if (is_kfence_address(objects[i])) {
@@ -771,7 +772,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
break;
}
}
- kmem_cache_free_bulk(test_cache, num, objects);
+ kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
/*
* kmem_cache_alloc_bulk() disables interrupts, and calling it
* in a tight loop may not give KFENCE a chance to switch the
diff --git a/mm/slab.h b/mm/slab.h
index bf2f87acf5e3a..1bf9c3021ae3d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -362,12 +362,12 @@ static inline unsigned int size_index_elem(unsigned int bytes)
* KMALLOC_MAX_CACHE_SIZE and the caller must check that.
*/
static inline struct kmem_cache *
-kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller)
+kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, kmalloc_token_t token)
{
unsigned int index;
if (!b)
- b = &kmalloc_caches[kmalloc_type(flags, caller)];
+ b = &kmalloc_caches[kmalloc_type(flags, token)];
if (size <= 192)
index = kmalloc_size_index[size_index_elem(size)];
else
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8b661fff5eedb..b6426d7ceec92 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -742,7 +742,7 @@ kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
{ /* initialization for https://llvm.org/pr42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+#ifdef CONFIG_KMALLOC_PARTITION_RANDOM
unsigned long random_kmalloc_seed __ro_after_init;
EXPORT_SYMBOL(random_kmalloc_seed);
#endif
@@ -787,7 +787,7 @@ size_t kmalloc_size_roundup(size_t size)
* The flags don't matter since size_index is common to all.
* Neither does the caller for just getting ->object_size.
*/
- return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
+ return kmalloc_slab(size, NULL, GFP_KERNEL, __kmalloc_token(0))->object_size;
}
/* Above the smaller buckets, size is a multiple of page size. */
@@ -821,26 +821,26 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
#define KMALLOC_RCL_NAME(sz)
#endif
-#ifdef CONFIG_RANDOM_KMALLOC_CACHES
-#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
-#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
-#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
-#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
-#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
-#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
-#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
-#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
-#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
-#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
-#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
-#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
-#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
-#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
-#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
-#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
-#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
-#else // CONFIG_RANDOM_KMALLOC_CACHES
-#define KMALLOC_RANDOM_NAME(N, sz)
+#ifdef CONFIG_KMALLOC_PARTITION_CACHES
+#define __KMALLOC_PARTITION_CONCAT(a, b) a ## b
+#define KMALLOC_PARTITION_NAME(N, sz) __KMALLOC_PARTITION_CONCAT(KMA_PART_, N)(sz)
+#define KMA_PART_1(sz) .name[KMALLOC_PARTITION_START + 1] = "kmalloc-part-01-" #sz,
+#define KMA_PART_2(sz) KMA_PART_1(sz) .name[KMALLOC_PARTITION_START + 2] = "kmalloc-part-02-" #sz,
+#define KMA_PART_3(sz) KMA_PART_2(sz) .name[KMALLOC_PARTITION_START + 3] = "kmalloc-part-03-" #sz,
+#define KMA_PART_4(sz) KMA_PART_3(sz) .name[KMALLOC_PARTITION_START + 4] = "kmalloc-part-04-" #sz,
+#define KMA_PART_5(sz) KMA_PART_4(sz) .name[KMALLOC_PARTITION_START + 5] = "kmalloc-part-05-" #sz,
+#define KMA_PART_6(sz) KMA_PART_5(sz) .name[KMALLOC_PARTITION_START + 6] = "kmalloc-part-06-" #sz,
+#define KMA_PART_7(sz) KMA_PART_6(sz) .name[KMALLOC_PARTITION_START + 7] = "kmalloc-part-07-" #sz,
+#define KMA_PART_8(sz) KMA_PART_7(sz) .name[KMALLOC_PARTITION_START + 8] = "kmalloc-part-08-" #sz,
+#define KMA_PART_9(sz) KMA_PART_8(sz) .name[KMALLOC_PARTITION_START + 9] = "kmalloc-part-09-" #sz,
+#define KMA_PART_10(sz) KMA_PART_9(sz) .name[KMALLOC_PARTITION_START + 10] = "kmalloc-part-10-" #sz,
+#define KMA_PART_11(sz) KMA_PART_10(sz) .name[KMALLOC_PARTITION_START + 11] = "kmalloc-part-11-" #sz,
+#define KMA_PART_12(sz) KMA_PART_11(sz) .name[KMALLOC_PARTITION_START + 12] = "kmalloc-part-12-" #sz,
+#define KMA_PART_13(sz) KMA_PART_12(sz) .name[KMALLOC_PARTITION_START + 13] = "kmalloc-part-13-" #sz,
+#define KMA_PART_14(sz) KMA_PART_13(sz) .name[KMALLOC_PARTITION_START + 14] = "kmalloc-part-14-" #sz,
+#define KMA_PART_15(sz) KMA_PART_14(sz) .name[KMALLOC_PARTITION_START + 15] = "kmalloc-part-15-" #sz,
+#else // CONFIG_KMALLOC_PARTITION_CACHES
+#define KMALLOC_PARTITION_NAME(N, sz)
#endif
#define INIT_KMALLOC_INFO(__size, __short_size) \
@@ -849,7 +849,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
KMALLOC_RCL_NAME(__short_size) \
KMALLOC_CGROUP_NAME(__short_size) \
KMALLOC_DMA_NAME(__short_size) \
- KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
+ KMALLOC_PARTITION_NAME(KMALLOC_PARTITION_CACHES_NR, __short_size) \
.size = __size, \
}
@@ -961,8 +961,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
flags |= SLAB_CACHE_DMA;
}
-#ifdef CONFIG_RANDOM_KMALLOC_CACHES
- if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
+#ifdef CONFIG_KMALLOC_PARTITION_CACHES
+ if (type >= KMALLOC_PARTITION_START && type <= KMALLOC_PARTITION_END)
flags |= SLAB_NO_MERGE;
#endif
@@ -1010,7 +1010,7 @@ void __init create_kmalloc_caches(void)
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
new_kmalloc_cache(i, type);
}
-#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+#ifdef CONFIG_KMALLOC_PARTITION_RANDOM
random_kmalloc_seed = get_random_u64();
#endif
diff --git a/mm/slub.c b/mm/slub.c
index a2bf3756ca7d0..67abbbf68fc10 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -227,6 +227,17 @@ struct partial_bulk_context {
struct list_head slabs;
};
+/*
+ * Structure used to iterate over objects within a slab.
+ *
+ * Set up by init_slab_obj_iter() and advanced by next_slab_obj(). The
+ * randomization fields exist only with CONFIG_SLAB_FREELIST_RANDOM.
+ */
+struct slab_obj_iter {
+ unsigned long pos; /* linear: index of the next object (scaled by s->size); random: next index into s->random_seq */
+ void *start; /* fixup_red_left(s, slab_address(slab)); base that object offsets are added to */
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+ unsigned long freelist_count; /* oo_objects(s->oo); pos wraps back to 0 on reaching it */
+ unsigned long page_limit; /* slab->objects * s->size; s->random_seq entries at or above it are skipped */
+ bool random; /* true when the slab has at least 2 objects and s->random_seq is set */
+#endif
+};
+
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
@@ -351,8 +362,8 @@ enum stat_item {
CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */
SHEAF_FLUSH, /* Objects flushed from a sheaf */
SHEAF_REFILL, /* Objects refilled to a sheaf */
- SHEAF_ALLOC, /* Allocation of an empty sheaf */
- SHEAF_FREE, /* Freeing of an empty sheaf */
+ SHEAF_ALLOC, /* Allocation of an empty sheaf including oversized ones */
+ SHEAF_FREE, /* Freeing of an empty sheaf including oversized ones */
BARN_GET, /* Got full sheaf from barn */
BARN_GET_FAIL, /* Failed to get full sheaf from barn */
BARN_PUT, /* Put full sheaf to barn */
@@ -2129,11 +2140,11 @@ static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
if (!is_kmalloc_normal(s))
return sz;
- obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0);
+ obj_exts_cache = kmalloc_slab(sz, NULL, gfp, __kmalloc_token(0));
/*
- * We can't simply compare s with obj_exts_cache, because random kmalloc
- * caches have multiple caches per size, selected by caller address.
- * Since caller address may differ between kmalloc_slab() and actual
+ * We can't simply compare s with obj_exts_cache, because partitioned kmalloc
+ * caches have multiple caches per size, selected by caller address or type.
+ * Since caller address or type may differ between kmalloc_slab() and actual
* allocation, bump size when sizes are equal.
*/
if (s->object_size == obj_exts_cache->object_size)
@@ -2733,7 +2744,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
return *head != NULL;
}
-static void *setup_object(struct kmem_cache *s, void *object)
+static inline void *setup_object(struct kmem_cache *s, void *object)
{
setup_object_debug(s, object);
object = kasan_init_slab_obj(s, object);
@@ -2751,11 +2762,6 @@ static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp,
struct slab_sheaf *sheaf;
size_t sheaf_size;
- if (gfp & __GFP_NO_OBJ_EXT)
- return NULL;
-
- gfp &= ~OBJCGS_CLEAR_MASK;
-
/*
* Prevent recursion to the same cache, or a deep stack of kmallocs of
* varying sizes (sheaf capacity might differ for each kmalloc size
@@ -2780,6 +2786,11 @@ static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp,
static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s,
gfp_t gfp)
{
+ if (gfp & __GFP_NO_OBJ_EXT)
+ return NULL;
+
+ gfp &= ~OBJCGS_CLEAR_MASK;
+
return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity);
}
@@ -3329,87 +3340,14 @@ static void __init init_freelist_randomization(void)
mutex_unlock(&slab_mutex);
}
-/* Get the next entry on the pre-computed freelist randomized */
-static void *next_freelist_entry(struct kmem_cache *s,
- unsigned long *pos, void *start,
- unsigned long page_limit,
- unsigned long freelist_count)
-{
- unsigned int idx;
-
- /*
- * If the target page allocation failed, the number of objects on the
- * page might be smaller than the usual size defined by the cache.
- */
- do {
- idx = s->random_seq[*pos];
- *pos += 1;
- if (*pos >= freelist_count)
- *pos = 0;
- } while (unlikely(idx >= page_limit));
-
- return (char *)start + idx;
-}
-
static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
-/* Shuffle the single linked freelist based on a random pre-computed sequence */
-static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
- bool allow_spin)
-{
- void *start;
- void *cur;
- void *next;
- unsigned long idx, pos, page_limit, freelist_count;
-
- if (slab->objects < 2 || !s->random_seq)
- return false;
-
- freelist_count = oo_objects(s->oo);
- if (allow_spin) {
- pos = get_random_u32_below(freelist_count);
- } else {
- struct rnd_state *state;
-
- /*
- * An interrupt or NMI handler might interrupt and change
- * the state in the middle, but that's safe.
- */
- state = &get_cpu_var(slab_rnd_state);
- pos = prandom_u32_state(state) % freelist_count;
- put_cpu_var(slab_rnd_state);
- }
-
- page_limit = slab->objects * s->size;
- start = fixup_red_left(s, slab_address(slab));
-
- /* First entry is used as the base of the freelist */
- cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
- cur = setup_object(s, cur);
- slab->freelist = cur;
-
- for (idx = 1; idx < slab->objects; idx++) {
- next = next_freelist_entry(s, &pos, start, page_limit,
- freelist_count);
- next = setup_object(s, next);
- set_freepointer(s, cur, next);
- cur = next;
- }
- set_freepointer(s, cur, NULL);
-
- return true;
-}
#else
static inline int init_cache_random_seq(struct kmem_cache *s)
{
return 0;
}
static inline void init_freelist_randomization(void) { }
-static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
- bool allow_spin)
-{
- return false;
-}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
static __always_inline void account_slab(struct slab *slab, int order,
@@ -3438,15 +3376,14 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
-(PAGE_SIZE << order));
}
+/* Allocate and initialize a slab without building its freelist. */
static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
bool allow_spin = gfpflags_allow_spinning(flags);
struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
- void *start, *p, *next;
- int idx;
- bool shuffle;
+ void *start;
flags &= gfp_allowed_mask;
@@ -3497,21 +3434,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
alloc_slab_obj_exts_early(s, slab);
account_slab(slab, oo_order(oo), s, flags);
- shuffle = shuffle_freelist(s, slab, allow_spin);
-
- if (!shuffle) {
- start = fixup_red_left(s, start);
- start = setup_object(s, start);
- slab->freelist = start;
- for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
- next = p + s->size;
- next = setup_object(s, next);
- set_freepointer(s, p, next);
- p = next;
- }
- set_freepointer(s, p, NULL);
- }
-
return slab;
}
@@ -3599,15 +3521,21 @@ static inline void slab_clear_node_partial(struct slab *slab)
/*
* Management of partially allocated slabs.
*/
+static inline void set_node_partial_state(struct kmem_cache_node *n,
+ struct slab *slab)
+{
+ slab_set_node_partial(slab); /* bookkeeping only: linking slab->slab_list into n->partial is left to the caller */
+ n->nr_partial++; /* count one more slab in this node's partial accounting */
+}
+
static inline void
__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode)
{
- n->nr_partial++;
if (mode == ADD_TO_TAIL)
list_add_tail(&slab->slab_list, &n->partial);
else
list_add(&slab->slab_list, &n->partial);
- slab_set_node_partial(slab);
+ set_node_partial_state(n, slab);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -3617,13 +3545,19 @@ static inline void add_partial(struct kmem_cache_node *n,
__add_partial(n, slab, mode);
}
+static inline void clear_node_partial_state(struct kmem_cache_node *n,
+ struct slab *slab)
+{
+ slab_clear_node_partial(slab); /* bookkeeping only: the caller unlinks or moves slab->slab_list itself */
+ n->nr_partial--; /* count one fewer slab in this node's partial accounting */
+}
+
static inline void remove_partial(struct kmem_cache_node *n,
struct slab *slab)
{
lockdep_assert_held(&n->list_lock);
list_del(&slab->slab_list);
- slab_clear_node_partial(slab);
- n->nr_partial--;
+ clear_node_partial_state(n, slab);
}
/*
@@ -3665,30 +3599,112 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
return object;
}
+/*
+ * Return the next free object in allocation order.
+ *
+ * With CONFIG_SLAB_FREELIST_RANDOM and iter->random set, the next offset is
+ * read from s->random_seq[iter->pos]; iter->pos then advances and wraps to 0
+ * at iter->freelist_count. Otherwise objects are handed out linearly, at
+ * iter->start + iter->pos * s->size, and iter->pos is post-incremented.
+ *
+ * Every returned object has been passed through setup_object(). The number
+ * of calls is not bounded here; the caller has to stop once it has consumed
+ * the objects it needs.
+ */
+static inline void *next_slab_obj(struct kmem_cache *s,
+ struct slab_obj_iter *iter)
+{
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+ if (iter->random) {
+ unsigned long idx;
+
+ /*
+ * If the target page allocation failed, the number of objects on the
+ * page might be smaller than the usual size defined by the cache.
+ */
+ do {
+ idx = s->random_seq[iter->pos];
+ iter->pos++;
+ if (iter->pos >= iter->freelist_count)
+ iter->pos = 0;
+ } while (unlikely(idx >= iter->page_limit));
+
+ return setup_object(s, (char *)iter->start + idx); /* idx is applied as a byte offset from iter->start */
+ }
+#endif
+ return setup_object(s, (char *)iter->start + iter->pos++ * s->size);
+}
+
+/*
+ * Build a freelist from the objects not yet allocated from a fresh slab.
+ *
+ * The number of objects to link is slab->objects - slab->inuse, so
+ * slab->inuse must already reflect what the caller took from @iter. With
+ * nothing left, slab->freelist is set to NULL. Otherwise the remaining
+ * objects are fetched with next_slab_obj(), chained in that order via
+ * set_freepointer(), and the last free pointer is set to NULL.
+ */
+static inline void build_slab_freelist(struct kmem_cache *s, struct slab *slab,
+ struct slab_obj_iter *iter)
+{
+ unsigned int nr = slab->objects - slab->inuse;
+ unsigned int i;
+ void *cur, *next;
+
+ if (!nr) {
+ slab->freelist = NULL;
+ return;
+ }
+
+ cur = next_slab_obj(s, iter);
+ slab->freelist = cur;
+
+ for (i = 1; i < nr; i++) {
+ next = next_slab_obj(s, iter);
+ set_freepointer(s, cur, next);
+ cur = next;
+ }
+
+ set_freepointer(s, cur, NULL);
+}
+
+/*
+ * Initialize an iterator over free objects in allocation order.
+ *
+ * iter->pos starts at 0 and iter->start is set to
+ * fixup_red_left(s, slab_address(slab)).
+ *
+ * With CONFIG_SLAB_FREELIST_RANDOM, randomized order is used when the slab
+ * has at least two objects and s->random_seq is set. In that case iter->pos
+ * is replaced by a random starting position below oo_objects(s->oo):
+ * get_random_u32_below() when @allow_spin is true, otherwise the per-CPU
+ * slab_rnd_state prandom state.
+ */
+static inline void init_slab_obj_iter(struct kmem_cache *s, struct slab *slab,
+ struct slab_obj_iter *iter,
+ bool allow_spin)
+{
+ iter->pos = 0;
+ iter->start = fixup_red_left(s, slab_address(slab));
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+ iter->random = (slab->objects >= 2 && s->random_seq);
+ if (!iter->random)
+ return;
+
+ iter->freelist_count = oo_objects(s->oo);
+ iter->page_limit = slab->objects * s->size;
+
+ if (allow_spin) {
+ iter->pos = get_random_u32_below(iter->freelist_count);
+ } else {
+ struct rnd_state *state;
+
+ /*
+ * An interrupt or NMI handler might interrupt and change
+ * the state in the middle, but that's safe.
+ */
+ state = &get_cpu_var(slab_rnd_state);
+ iter->pos = prandom_u32_state(state) % iter->freelist_count;
+ put_cpu_var(slab_rnd_state);
+ }
+#endif
+}
+
/*
* Called only for kmem_cache_debug() caches to allocate from a freshly
* allocated slab. Allocate a single object instead of whole freelist
* and put the slab to the partial (or full) list.
*/
static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
- int orig_size, gfp_t gfpflags)
+ int orig_size, bool allow_spin)
{
- bool allow_spin = gfpflags_allow_spinning(gfpflags);
- int nid = slab_nid(slab);
- struct kmem_cache_node *n = get_node(s, nid);
+ struct kmem_cache_node *n;
+ struct slab_obj_iter iter;
+ bool needs_add_partial;
unsigned long flags;
void *object;
- if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
- /* Unlucky, discard newly allocated slab. */
- free_new_slab_nolock(s, slab);
- return NULL;
- }
-
- object = slab->freelist;
- slab->freelist = get_freepointer(s, object);
+ init_slab_obj_iter(s, slab, &iter, allow_spin);
+ object = next_slab_obj(s, &iter);
slab->inuse = 1;
+ needs_add_partial = (slab->objects > 1);
+ build_slab_freelist(s, slab, &iter);
+
+ /* alloc_debug_processing() always expects a valid freepointer */
+ set_freepointer(s, object, slab->freelist);
+
if (!alloc_debug_processing(s, slab, object, orig_size)) {
/*
* It's not really expected that this would fail on a
@@ -3696,20 +3712,32 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
* corruption in theory could cause that.
* Leak memory of allocated slab.
*/
- if (!allow_spin)
- spin_unlock_irqrestore(&n->list_lock, flags);
return NULL;
}
- if (allow_spin)
+ n = get_node(s, slab_nid(slab));
+ if (allow_spin) {
spin_lock_irqsave(&n->list_lock, flags);
+ } else if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+ /*
+ * Unlucky, discard newly allocated slab.
+ * The slab is not fully free, but it's fine as
+ * objects are not allocated to users.
+ */
+ free_new_slab_nolock(s, slab);
+ return NULL;
+ }
- if (slab->inuse == slab->objects)
- add_full(s, n, slab);
- else
+ if (needs_add_partial)
add_partial(n, slab, ADD_TO_HEAD);
+ else
+ add_full(s, n, slab);
- inc_slabs_node(s, nid, slab->objects);
+ /*
+ * Debug caches require nr_slabs updates under n->list_lock so validation
+ * cannot race with slab (de)allocations and observe inconsistent state.
+ */
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
spin_unlock_irqrestore(&n->list_lock, flags);
return object;
@@ -3723,6 +3751,7 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
bool allow_spin)
{
struct slab *slab, *slab2;
+ struct slab *first = NULL, *last = NULL;
unsigned int total_free = 0;
unsigned long flags;
@@ -3741,8 +3770,15 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
struct freelist_counters flc;
unsigned int slab_free;
- if (!pfmemalloc_match(slab, pc->flags))
+ if (!pfmemalloc_match(slab, pc->flags)) {
+ if (first) {
+ list_bulk_move_tail(&pc->slabs,
+ &first->slab_list,
+ &last->slab_list);
+ first = NULL;
+ }
continue;
+ }
/*
* determine the number of free objects in the slab racily
@@ -3759,15 +3795,20 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
&& total_free + slab_free > pc->max_objects)
break;
- remove_partial(n, slab);
-
- list_add(&slab->slab_list, &pc->slabs);
+ if (!first)
+ first = slab;
+ last = slab;
+ clear_node_partial_state(n, slab);
total_free += slab_free;
if (total_free >= pc->max_objects)
break;
}
+ if (first)
+ list_bulk_move_tail(&pc->slabs, &first->slab_list,
+ &last->slab_list);
+
spin_unlock_irqrestore(&n->list_lock, flags);
return total_free > 0;
}
@@ -4311,7 +4352,8 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
* Assumes this is performed only for caches without debugging so we
* don't need to worry about adding the slab to the full list.
*/
-static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab,
+ unsigned int *count)
{
struct freelist_counters old, new;
@@ -4327,6 +4369,7 @@ static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *sla
} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+ *count = old.objects - old.inuse;
return old.freelist;
}
@@ -4349,44 +4392,41 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
void **p, unsigned int count, bool allow_spin)
{
unsigned int allocated = 0;
- struct kmem_cache_node *n;
- bool needs_add_partial;
+ struct slab_obj_iter iter;
+ bool needs_add_partial = true;
unsigned long flags;
- void *object;
/*
* Are we going to put the slab on the partial list?
* Note slab->inuse is 0 on a new slab.
*/
- needs_add_partial = (slab->objects > count);
-
- if (!allow_spin && needs_add_partial) {
-
- n = get_node(s, slab_nid(slab));
-
- if (!spin_trylock_irqsave(&n->list_lock, flags)) {
- /* Unlucky, discard newly allocated slab */
- free_new_slab_nolock(s, slab);
- return 0;
- }
+ if (count >= slab->objects) {
+ needs_add_partial = false;
+ count = slab->objects;
}
- object = slab->freelist;
- while (object && allocated < count) {
- p[allocated] = object;
- object = get_freepointer(s, object);
- maybe_wipe_obj_freeptr(s, p[allocated]);
+ init_slab_obj_iter(s, slab, &iter, allow_spin);
- slab->inuse++;
+ while (allocated < count) {
+ p[allocated] = next_slab_obj(s, &iter);
allocated++;
}
- slab->freelist = object;
+ slab->inuse = count;
+ build_slab_freelist(s, slab, &iter);
if (needs_add_partial) {
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
if (allow_spin) {
- n = get_node(s, slab_nid(slab));
spin_lock_irqsave(&n->list_lock, flags);
+ } else if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+ /*
+ * Unlucky, discard newly allocated slab.
+ * The slab is not fully free, but it's fine as
+ * objects are not allocated to users.
+ */
+ free_new_slab_nolock(s, slab);
+ return 0;
}
add_partial(n, slab, ADD_TO_HEAD);
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4457,15 +4497,13 @@ new_objects:
stat(s, ALLOC_SLAB);
if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
+ object = alloc_single_from_new_slab(s, slab, orig_size, allow_spin);
if (likely(object))
goto success;
} else {
- alloc_from_new_slab(s, slab, &object, 1, allow_spin);
-
/* we don't need to check SLAB_STORE_USER here */
- if (likely(object))
+ if (alloc_from_new_slab(s, slab, &object, 1, allow_spin))
return object;
}
@@ -4981,8 +5019,8 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
return ret;
}
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p);
+static bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p);
/*
* returns a sheaf that has at least the requested size
@@ -5002,21 +5040,20 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
if (unlikely(size > s->sheaf_capacity)) {
- sheaf = kzalloc_flex(*sheaf, objects, size, gfp);
+ sheaf = __alloc_empty_sheaf(s, gfp, size);
if (!sheaf)
return NULL;
stat(s, SHEAF_PREFILL_OVERSIZE);
- sheaf->cache = s;
sheaf->capacity = size;
/*
* we do not need to care about pfmemalloc here because oversize
- * sheaves area always flushed and freed when returned
+ * sheaves are always flushed and freed when returned
*/
if (!__kmem_cache_alloc_bulk(s, gfp, size,
&sheaf->objects[0])) {
- kfree(sheaf);
+ free_empty_sheaf(s, sheaf);
return NULL;
}
@@ -5084,7 +5121,7 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
if (unlikely((sheaf->capacity != s->sheaf_capacity)
|| sheaf->pfmemalloc)) {
sheaf_flush_unused(s, sheaf);
- kfree(sheaf);
+ free_empty_sheaf(s, sheaf);
return;
}
@@ -5154,9 +5191,8 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
- &sheaf->objects[sheaf->size])) {
+ &sheaf->objects[sheaf->size]))
return -ENOMEM;
- }
sheaf->size = sheaf->capacity;
return 0;
@@ -5275,7 +5311,7 @@ EXPORT_SYMBOL(__kmalloc_large_node_noprof);
static __always_inline
void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
- unsigned long caller)
+ unsigned long caller, kmalloc_token_t token)
{
struct kmem_cache *s;
void *ret;
@@ -5290,37 +5326,28 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
if (unlikely(!size))
return ZERO_SIZE_PTR;
- s = kmalloc_slab(size, b, flags, caller);
+ s = kmalloc_slab(size, b, flags, token);
ret = slab_alloc_node(s, NULL, flags, node, caller, size);
ret = kasan_kmalloc(s, ret, size, flags);
trace_kmalloc(caller, ret, size, s->size, flags, node);
return ret;
}
-void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+void *__kmalloc_node_noprof(DECL_KMALLOC_PARAMS(size, b, token), gfp_t flags, int node)
{
- return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
+ return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node,
+ _RET_IP_, PASS_TOKEN_PARAM(token));
}
EXPORT_SYMBOL(__kmalloc_node_noprof);
-void *__kmalloc_noprof(size_t size, gfp_t flags)
+void *__kmalloc_noprof(DECL_TOKEN_PARAMS(size, token), gfp_t flags)
{
- return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
+ return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_,
+ PASS_TOKEN_PARAM(token));
}
EXPORT_SYMBOL(__kmalloc_noprof);
-/**
- * kmalloc_nolock - Allocate an object of given size from any context.
- * @size: size to allocate
- * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
- * allowed.
- * @node: node number of the target node.
- *
- * Return: pointer to the new object or NULL in case of error.
- * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
- * There is no reason to call it again and expect !NULL.
- */
-void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
+void *_kmalloc_nolock_noprof(DECL_TOKEN_PARAMS(size, token), gfp_t gfp_flags, int node)
{
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
struct kmem_cache *s;
@@ -5347,7 +5374,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
- s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
+ s = kmalloc_slab(size, NULL, alloc_gfp, PASS_TOKEN_PARAM(token));
if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
/*
@@ -5400,12 +5427,13 @@ success:
ret = kasan_kmalloc(s, ret, size, alloc_gfp);
return ret;
}
-EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
+EXPORT_SYMBOL_GPL(_kmalloc_nolock_noprof);
-void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
+void *__kmalloc_node_track_caller_noprof(DECL_KMALLOC_PARAMS(size, b, token), gfp_t flags,
int node, unsigned long caller)
{
- return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
+ return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node,
+ caller, PASS_TOKEN_PARAM(token)); /* uses the caller-supplied address instead of _RET_IP_ */
}
EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
@@ -5500,6 +5528,34 @@ static noinline void free_to_partial_list(
}
/*
+ * Try returning (remainder of) the freelist that we just detached from the
+ * slab. Optimistically assume the slab is still full, so we don't need to find
+ * the tail of the detached freelist.
+ *
+ * Fail if the slab isn't full anymore due to a concurrent free.
+ */
+static bool __slab_try_return_freelist(struct kmem_cache *s, struct slab *slab,
+ void *head, int cnt) /* head: first object of the list to hand back; cnt: amount subtracted from the slab's inuse */
+{
+ struct freelist_counters old, new;
+
+ old.freelist = slab->freelist; /* current slab state, passed to slab_update_freelist() as the 'old' value */
+ old.counters = slab->counters;
+
+ if (old.freelist) /* slab has a freelist again, so it is no longer full: give up */
+ return false;
+
+ new.freelist = head; /* the handed-back list becomes the slab's freelist */
+ new.counters = old.counters; /* must precede the inuse adjustment below */
+ new.inuse -= cnt; /* NOTE(review): relies on inuse sharing storage with counters in struct freelist_counters - confirm */
+
+ if (!slab_update_freelist(s, slab, &old, &new, "__slab_try_return_freelist")) /* presumably fails when the slab changed since the snapshot - verify against slab_update_freelist() */
+ return false;
+
+ return true; /* list returned without having to walk it to find the tail */
+}
+
+/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
@@ -6636,7 +6692,7 @@ void kfree_nolock(const void *object)
EXPORT_SYMBOL_GPL(kfree_nolock);
static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
+__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid, kmalloc_token_t token)
{
void *ret;
size_t ks = 0;
@@ -6708,7 +6764,7 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags,
return (void *)p;
alloc_new:
- ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
+ ret = __kmalloc_node_track_caller_noprof(PASS_KMALLOC_PARAMS(new_size, NULL, token), flags, nid, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -6719,45 +6775,7 @@ alloc_new:
return ret;
}
-/**
- * krealloc_node_align - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @align: desired alignment.
- * @flags: the type of memory to allocate.
- * @nid: NUMA node or NUMA_NO_NODE
- *
- * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
- * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
- * Documentation/core-api/memory-allocation.rst for more details.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
- * size of an allocation (but not the exact size it was allocated with) and
- * hence implements the following semantics for shrinking and growing buffers
- * with __GFP_ZERO::
- *
- * new bucket
- * 0 size size
- * |--------|----------------|
- * | keep | zero |
- *
- * Otherwise, the original allocation size 'orig_size' could be used to
- * precisely clear the requested size, and the new size will also be stored
- * as the new 'orig_size'.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
+void *krealloc_node_align_noprof(const void *p, DECL_TOKEN_PARAMS(new_size, token), unsigned long align,
gfp_t flags, int nid)
{
void *ret;
@@ -6767,7 +6785,7 @@ void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long a
return ZERO_SIZE_PTR;
}
- ret = __do_krealloc(p, new_size, align, flags, nid);
+ ret = __do_krealloc(p, new_size, align, flags, nid, PASS_TOKEN_PARAM(token));
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
@@ -6799,28 +6817,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
return flags;
}
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @align: desired alignment.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
- * Documentation/core-api/memory-allocation.rst for more details.
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+void *__kvmalloc_node_noprof(DECL_KMALLOC_PARAMS(size, b, token), unsigned long align,
gfp_t flags, int node)
{
bool allow_block;
@@ -6832,7 +6829,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
*/
ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
kmalloc_gfp_adjust(flags, size),
- node, _RET_IP_);
+ node, _RET_IP_, PASS_TOKEN_PARAM(token));
if (ret || size <= PAGE_SIZE)
return ret;
@@ -6917,34 +6914,7 @@ void kvfree_sensitive(const void *addr, size_t len)
}
EXPORT_SYMBOL(kvfree_sensitive);
-/**
- * kvrealloc_node_align - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @align: desired alignment
- * @flags: the flags for the page level allocator
- * @nid: NUMA node id
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
- * Documentation/core-api/memory-allocation.rst for more details.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+void *kvrealloc_node_align_noprof(const void *p, DECL_TOKEN_PARAMS(size, token), unsigned long align,
gfp_t flags, int nid)
{
void *n;
@@ -6952,10 +6922,10 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig
if (is_vmalloc_addr(p))
return vrealloc_node_align_noprof(p, size, align, flags, nid);
- n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
+ n = krealloc_node_align_noprof(p, PASS_TOKEN_PARAMS(size, token), align, kmalloc_gfp_adjust(flags, size), nid);
if (!n) {
/* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_node_align_noprof(size, align, flags, nid);
+ n = __kvmalloc_node_noprof(PASS_KMALLOC_PARAMS(size, NULL, token), align, flags, nid);
if (!n)
return NULL;
@@ -7126,60 +7096,56 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi
list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+ unsigned int count;
+
list_del(&slab->slab_list);
- object = get_freelist_nofreeze(s, slab);
+ object = get_freelist_nofreeze(s, slab, &count);
- while (object && refilled < max) {
+ while (count && refilled < max) {
p[refilled] = object;
object = get_freepointer(s, object);
maybe_wipe_obj_freeptr(s, p[refilled]);
refilled++;
+ count--;
}
/*
* Freelist had more objects than we can accommodate, we need to
- * free them back. We can treat it like a detached freelist, just
- * need to find the tail object.
+ * free them back. First we try to be optimistic and assume the
+ * slab is still full since we just detached its freelist.
+ * Otherwise we must find the tail object.
*/
- if (unlikely(object)) {
+ if (unlikely(count)) {
void *head = object;
void *tail;
- int cnt = 0;
+
+ if (__slab_try_return_freelist(s, slab, head, count)) {
+ list_add(&slab->slab_list, &pc.slabs);
+ break;
+ }
do {
tail = object;
- cnt++;
object = get_freepointer(s, object);
} while (object);
- __slab_free(s, slab, head, tail, cnt, _RET_IP_);
+ __slab_free(s, slab, head, tail, count, _RET_IP_);
}
if (refilled >= max)
break;
}
- if (unlikely(!list_empty(&pc.slabs))) {
+ if (!list_empty(&pc.slabs)) {
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
-
- if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
- continue;
+ list_for_each_entry(slab, &pc.slabs, slab_list)
+ set_node_partial_state(n, slab);
- list_del(&slab->slab_list);
- add_partial(n, slab, ADD_TO_HEAD);
- }
+ list_splice_tail(&pc.slabs, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
-
- /* any slabs left are completely free and for discard */
- list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
-
- list_del(&slab->slab_list);
- discard_slab(s, slab);
- }
}
return refilled;
@@ -7275,10 +7241,6 @@ new_slab:
stat(s, ALLOC_SLAB);
- /*
- * TODO: possible optimization - if we know we will consume the whole
- * slab we might skip creating the freelist?
- */
refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
/* allow_spin = */ true);
@@ -7289,9 +7251,8 @@ out:
return refilled;
}
-static inline
-int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+static bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
{
int i;
@@ -7312,30 +7273,43 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
stat_add(s, ALLOC_SLOWPATH, i);
}
- return i;
+ return true;
error:
__kmem_cache_free_bulk(s, i, p);
- return 0;
-
+ return false;
}
-/*
- * Note that interrupts must be enabled when calling this function and gfp
- * flags must allow spinning.
+/**
+ * kmem_cache_alloc_bulk - Allocate multiple objects
+ * @s: The cache to allocate from
+ * @flags: GFP_* flags. See kmalloc().
+ * @size: Number of objects to allocate
+ * @p: Array of allocated objects
+ *
+ * Allocate @size objects from @s and place them into @p. @size must be larger
+ * than 0.
+ *
+ * Interrupts must be enabled when calling this function and @flags must allow
+ * spinning.
+ *
+ * Unlike alloc_pages_bulk(), this function does not check for already allocated
+ * objects in @p, and thus the caller does not need to zero it.
+ *
+ * Return: %true if the allocation succeeded, or %false if it failed.
*/
-int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+bool kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
{
unsigned int i = 0;
void *kfence_obj;
if (!size)
- return 0;
+ return false;
s = slab_pre_alloc_hook(s, flags);
if (unlikely(!s))
- return 0;
+ return false;
/*
* to make things simpler, only assume at most once kfence allocated
@@ -7352,18 +7326,18 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
}
i = alloc_from_pcs_bulk(s, flags, size, p);
-
if (i < size) {
/*
* If we ran out of memory, don't bother with freeing back to
* the percpu sheaves, we have bigger problems.
*/
- if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
+ if (unlikely(!__kmem_cache_alloc_bulk(s, flags, size - i,
+ p + i))) {
if (i > 0)
__kmem_cache_free_bulk(s, i, p);
if (kfence_obj)
__kfence_free(kfence_obj);
- return 0;
+ return false;
}
}
@@ -7378,16 +7352,9 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
}
out:
- /*
- * memcg and kmem_cache debug support and memory initialization.
- * Done outside of the IRQ disabled fastpath loop.
- */
- if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
- slab_want_init_on_alloc(flags, s), s->object_size))) {
- return 0;
- }
-
- return size;
+ /* memcg and kmem_cache debug support and memory initialization */
+ return likely(slab_post_alloc_hook(s, NULL, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size));
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
@@ -7609,6 +7576,7 @@ static void early_kmem_cache_node_alloc(int node)
{
struct slab *slab;
struct kmem_cache_node *n;
+ struct slab_obj_iter iter;
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
@@ -7620,14 +7588,18 @@ static void early_kmem_cache_node_alloc(int node)
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
}
- n = slab->freelist;
+ init_slab_obj_iter(kmem_cache_node, slab, &iter, true);
+
+ n = next_slab_obj(kmem_cache_node, &iter);
BUG_ON(!n);
+
+ slab->inuse = 1;
+ build_slab_freelist(kmem_cache_node, slab, &iter);
+
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
#endif
n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
- slab->freelist = get_freepointer(kmem_cache_node, n);
- slab->inuse = 1;
kmem_cache_node->per_node[node].node = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, slab->objects);
@@ -8245,8 +8217,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
if (free == slab->objects) {
list_move(&slab->slab_list, &discard);
- slab_clear_node_partial(slab);
- n->nr_partial--;
+ clear_node_partial_state(n, slab);
dec_slabs_node(s, node, slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&slab->slab_list, promote + free - 1);
@@ -8470,7 +8441,7 @@ static void __init bootstrap_kmalloc_sheaves(void)
{
enum kmalloc_cache_type type;
- for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_PARTITION_END; type++) {
for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
if (kmalloc_caches[type][idx])
bootstrap_cache_sheaves(kmalloc_caches[type][idx]);