diff options
| author | Mark Brown <broonie@kernel.org> | 2026-05-30 00:25:45 +0100 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2026-05-30 00:25:46 +0100 |
| commit | fe9618ab266d20638357eff97d84540aeb22d69b (patch) | |
| tree | 4139a361d7bb521ee94831414b25a1567e4c11a8 /mm | |
| parent | 99befc896988c8b8b3b948b19c9d1a4e40025c07 (diff) | |
| parent | 1d8f40ed9011a5a660e952235a0e8db991de509a (diff) | |
| download | linux-next-history-fe9618ab266d20638357eff97d84540aeb22d69b.tar.gz | |
Merge branch 'slab/for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 73 | ||||
| -rw-r--r-- | mm/kasan/kasan_test_c.c | 5 | ||||
| -rw-r--r-- | mm/kfence/kfence_test.c | 13 | ||||
| -rw-r--r-- | mm/slab.h | 4 | ||||
| -rw-r--r-- | mm/slab_common.c | 52 | ||||
| -rw-r--r-- | mm/slub.c | 655 |
6 files changed, 413 insertions, 389 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e649a950be93f..9e0ca48249054 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -248,22 +248,75 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA -config RANDOM_KMALLOC_CACHES - default n +config KMALLOC_PARTITION_CACHES depends on !SLUB_TINY - bool "Randomize slab caches for normal kmalloc" + bool "Partitioned slab caches for normal kmalloc" + default RANDOM_KMALLOC_CACHES help - A hardening feature that creates multiple copies of slab caches for - normal kmalloc allocation and makes kmalloc randomly pick one based - on code address, which makes the attackers more difficult to spray - vulnerable memory objects on the heap for the purpose of exploiting - memory vulnerabilities. + A hardening feature that creates multiple isolated copies of slab + caches for normal kmalloc allocations. This makes it more difficult + to exploit memory-safety vulnerabilities by attacking vulnerable + co-located memory objects. Several modes are provided. Currently the number of copies is set to 16, a reasonably large value that effectively diverges the memory objects allocated for different subsystems or modules into different caches, at the expense of a - limited degree of memory and CPU overhead that relates to hardware and - system workload. + limited degree of memory and CPU overhead that relates to hardware + and system workload. + +choice + prompt "Partitioned slab cache mode" + depends on KMALLOC_PARTITION_CACHES + default KMALLOC_PARTITION_TYPED if CC_HAS_ALLOC_TOKEN + default KMALLOC_PARTITION_RANDOM + help + Selects the slab cache partitioning mode. + +config KMALLOC_PARTITION_RANDOM + bool "Randomize slab caches for normal kmalloc" + help + Randomly pick a slab cache based on code address and a per-boot + random seed. + + This makes it harder for attackers to predict object co-location. + The placement is random: while attackers don't know which kmalloc + cache an object will be allocated from, they might circumvent + the randomization by retrying attacks across multiple machines until + the target objects are co-located. + +config KMALLOC_PARTITION_TYPED + bool "Type based slab cache selection for normal kmalloc" + depends on CC_HAS_ALLOC_TOKEN + help + Rely on Clang's allocation tokens to choose a slab cache, where token + IDs are derived from the allocated type. + + Unlike KMALLOC_PARTITION_RANDOM, cache assignment is deterministic based + on type, which guarantees that objects of certain types are not + placed in the same cache. This effectively mitigates certain classes + of exploits that probabilistic defenses like KMALLOC_PARTITION_RANDOM + only make harder but not impossible. However, this also means the + cache assignment is predictable. + + Clang's default token ID calculation returns a bounded hash with + disjoint ranges for pointer-containing and pointerless objects: when + used as the slab cache index, this prevents buffer overflows on + primitive buffers from directly corrupting pointer-containing + objects. + + The current effectiveness of Clang's type inference can be judged by + -Rpass=alloc-token, which provides diagnostics where (after dead-code + elimination) type inference failed. + + Requires Clang 22 or later. + +endchoice + +config RANDOM_KMALLOC_CACHES + bool + transitional + help + Transitional config for migration to KMALLOC_PARTITION_CACHES. endmenu # Slab allocator options diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 3f4ed29178b3c..b9e167ed5be32 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1225,14 +1225,13 @@ static void kmem_cache_bulk(struct kunit *test) struct kmem_cache *cache; size_t size = 200; char *p[10]; - bool ret; int i; cache = kmem_cache_create("test_cache", size, 0, 0, NULL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); - ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p); - if (!ret) { + if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), + (void **)&p)) { kunit_err(test, "Allocation failed: %s\n", __func__); kmem_cache_destroy(cache); return; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 10424cd25e5a6..de2d0f7d62b15 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -214,7 +214,7 @@ static void test_cache_destroy(void) static inline size_t kmalloc_cache_alignment(size_t size) { /* just to get ->align so no need to pass in the real caller */ - enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, 0); + enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, __kmalloc_token(0)); return kmalloc_caches[type][__kmalloc_index(size, false)]->align; } @@ -285,7 +285,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat if (is_kfence_address(alloc)) { struct slab *slab = virt_to_slab(alloc); - enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, _RET_IP_); + enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, __kmalloc_token(size)); struct kmem_cache *s = test_cache ?: kmalloc_caches[type][__kmalloc_index(size, false)]; @@ -761,9 +761,10 @@ static void test_memcache_alloc_bulk(struct kunit *test) timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); do { void *objects[100]; - int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), - objects); - if (!num) + int i; + + if (!kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, + ARRAY_SIZE(objects), objects)) continue; for (i = 0; i < ARRAY_SIZE(objects); i++) { if (is_kfence_address(objects[i])) { @@ -771,7 +772,7 @@ static void test_memcache_alloc_bulk(struct kunit *test) break; } } - kmem_cache_free_bulk(test_cache, num, objects); + kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects); /* * kmem_cache_alloc_bulk() disables interrupts, and calling it * in a tight loop may not give KFENCE a chance to switch the diff --git a/mm/slab.h b/mm/slab.h index bf2f87acf5e3a..1bf9c3021ae3d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -362,12 +362,12 @@ static inline unsigned int size_index_elem(unsigned int bytes) * KMALLOC_MAX_CACHE_SIZE and the caller must check that. */ static inline struct kmem_cache * -kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) +kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, kmalloc_token_t token) { unsigned int index; if (!b) - b = &kmalloc_caches[kmalloc_type(flags, caller)]; + b = &kmalloc_caches[kmalloc_type(flags, token)]; if (size <= 192) index = kmalloc_size_index[size_index_elem(size)]; else diff --git a/mm/slab_common.c b/mm/slab_common.c index 8b661fff5eedb..b6426d7ceec92 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -742,7 +742,7 @@ kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init = { /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_RANDOM_KMALLOC_CACHES +#ifdef CONFIG_KMALLOC_PARTITION_RANDOM unsigned long random_kmalloc_seed __ro_after_init; EXPORT_SYMBOL(random_kmalloc_seed); #endif @@ -787,7 +787,7 @@ size_t kmalloc_size_roundup(size_t size) * The flags don't matter since size_index is common to all. * Neither does the caller for just getting ->object_size. */ - return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size; + return kmalloc_slab(size, NULL, GFP_KERNEL, __kmalloc_token(0))->object_size; } /* Above the smaller buckets, size is a multiple of page size. */ @@ -821,26 +821,26 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_RCL_NAME(sz) #endif -#ifdef CONFIG_RANDOM_KMALLOC_CACHES -#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b -#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz) -#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz, -#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz, -#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz, -#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz, -#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz, -#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz, -#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz, -#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz, -#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz, -#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz, -#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz, -#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz, -#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz, -#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz, -#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz, -#else // CONFIG_RANDOM_KMALLOC_CACHES -#define KMALLOC_RANDOM_NAME(N, sz) +#ifdef CONFIG_KMALLOC_PARTITION_CACHES +#define __KMALLOC_PARTITION_CONCAT(a, b) a ## b +#define KMALLOC_PARTITION_NAME(N, sz) __KMALLOC_PARTITION_CONCAT(KMA_PART_, N)(sz) +#define KMA_PART_1(sz) .name[KMALLOC_PARTITION_START + 1] = "kmalloc-part-01-" #sz, +#define KMA_PART_2(sz) KMA_PART_1(sz) .name[KMALLOC_PARTITION_START + 2] = "kmalloc-part-02-" #sz, +#define KMA_PART_3(sz) KMA_PART_2(sz) .name[KMALLOC_PARTITION_START + 3] = "kmalloc-part-03-" #sz, +#define KMA_PART_4(sz) KMA_PART_3(sz) .name[KMALLOC_PARTITION_START + 4] = "kmalloc-part-04-" #sz, +#define KMA_PART_5(sz) KMA_PART_4(sz) .name[KMALLOC_PARTITION_START + 5] = "kmalloc-part-05-" #sz, +#define KMA_PART_6(sz) KMA_PART_5(sz) .name[KMALLOC_PARTITION_START + 6] = "kmalloc-part-06-" #sz, +#define KMA_PART_7(sz) KMA_PART_6(sz) .name[KMALLOC_PARTITION_START + 7] = "kmalloc-part-07-" #sz, +#define KMA_PART_8(sz) KMA_PART_7(sz) .name[KMALLOC_PARTITION_START + 8] = "kmalloc-part-08-" #sz, +#define KMA_PART_9(sz) KMA_PART_8(sz) .name[KMALLOC_PARTITION_START + 9] = "kmalloc-part-09-" #sz, +#define KMA_PART_10(sz) KMA_PART_9(sz) .name[KMALLOC_PARTITION_START + 10] = "kmalloc-part-10-" #sz, +#define KMA_PART_11(sz) KMA_PART_10(sz) .name[KMALLOC_PARTITION_START + 11] = "kmalloc-part-11-" #sz, +#define KMA_PART_12(sz) KMA_PART_11(sz) .name[KMALLOC_PARTITION_START + 12] = "kmalloc-part-12-" #sz, +#define KMA_PART_13(sz) KMA_PART_12(sz) .name[KMALLOC_PARTITION_START + 13] = "kmalloc-part-13-" #sz, +#define KMA_PART_14(sz) KMA_PART_13(sz) .name[KMALLOC_PARTITION_START + 14] = "kmalloc-part-14-" #sz, +#define KMA_PART_15(sz) KMA_PART_14(sz) .name[KMALLOC_PARTITION_START + 15] = "kmalloc-part-15-" #sz, +#else // CONFIG_KMALLOC_PARTITION_CACHES +#define KMALLOC_PARTITION_NAME(N, sz) #endif #define INIT_KMALLOC_INFO(__size, __short_size) \ @@ -849,7 +849,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ - KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \ + KMALLOC_PARTITION_NAME(KMALLOC_PARTITION_CACHES_NR, __short_size) \ .size = __size, \ } @@ -961,8 +961,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) flags |= SLAB_CACHE_DMA; } -#ifdef CONFIG_RANDOM_KMALLOC_CACHES - if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END) +#ifdef CONFIG_KMALLOC_PARTITION_CACHES + if (type >= KMALLOC_PARTITION_START && type <= KMALLOC_PARTITION_END) flags |= SLAB_NO_MERGE; #endif @@ -1010,7 +1010,7 @@ void __init create_kmalloc_caches(void) for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) new_kmalloc_cache(i, type); } -#ifdef CONFIG_RANDOM_KMALLOC_CACHES +#ifdef CONFIG_KMALLOC_PARTITION_RANDOM random_kmalloc_seed = get_random_u64(); #endif diff --git a/mm/slub.c b/mm/slub.c index a2bf3756ca7d0..67abbbf68fc10 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -227,6 +227,17 @@ struct partial_bulk_context { struct list_head slabs; }; +/* Structure used to iterate over objects within a slab */ +struct slab_obj_iter { + unsigned long pos; + void *start; +#ifdef CONFIG_SLAB_FREELIST_RANDOM + unsigned long freelist_count; + unsigned long page_limit; + bool random; +#endif +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); @@ -351,8 +362,8 @@ enum stat_item { CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ SHEAF_FLUSH, /* Objects flushed from a sheaf */ SHEAF_REFILL, /* Objects refilled to a sheaf */ - SHEAF_ALLOC, /* Allocation of an empty sheaf */ - SHEAF_FREE, /* Freeing of an empty sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf including oversized ones */ + SHEAF_FREE, /* Freeing of an empty sheaf including oversized ones */ BARN_GET, /* Got full sheaf from barn */ BARN_GET_FAIL, /* Failed to get full sheaf from barn */ BARN_PUT, /* Put full sheaf to barn */ @@ -2129,11 +2140,11 @@ static inline size_t obj_exts_alloc_size(struct kmem_cache *s, if (!is_kmalloc_normal(s)) return sz; - obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0); + obj_exts_cache = kmalloc_slab(sz, NULL, gfp, __kmalloc_token(0)); /* - * We can't simply compare s with obj_exts_cache, because random kmalloc - * caches have multiple caches per size, selected by caller address. - * Since caller address may differ between kmalloc_slab() and actual + * We can't simply compare s with obj_exts_cache, because partitioned kmalloc + * caches have multiple caches per size, selected by caller address or type. + * Since caller address or type may differ between kmalloc_slab() and actual * allocation, bump size when sizes are equal. */ if (s->object_size == obj_exts_cache->object_size) @@ -2733,7 +2744,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, return *head != NULL; } -static void *setup_object(struct kmem_cache *s, void *object) +static inline void *setup_object(struct kmem_cache *s, void *object) { setup_object_debug(s, object); object = kasan_init_slab_obj(s, object); @@ -2751,11 +2762,6 @@ static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, struct slab_sheaf *sheaf; size_t sheaf_size; - if (gfp & __GFP_NO_OBJ_EXT) - return NULL; - - gfp &= ~OBJCGS_CLEAR_MASK; - /* * Prevent recursion to the same cache, or a deep stack of kmallocs of * varying sizes (sheaf capacity might differ for each kmalloc size @@ -2780,6 +2786,11 @@ static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) { + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); } @@ -3329,87 +3340,14 @@ static void __init init_freelist_randomization(void) mutex_unlock(&slab_mutex); } -/* Get the next entry on the pre-computed freelist randomized */ -static void *next_freelist_entry(struct kmem_cache *s, - unsigned long *pos, void *start, - unsigned long page_limit, - unsigned long freelist_count) -{ - unsigned int idx; - - /* - * If the target page allocation failed, the number of objects on the - * page might be smaller than the usual size defined by the cache. - */ - do { - idx = s->random_seq[*pos]; - *pos += 1; - if (*pos >= freelist_count) - *pos = 0; - } while (unlikely(idx >= page_limit)); - - return (char *)start + idx; -} - static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state); -/* Shuffle the single linked freelist based on a random pre-computed sequence */ -static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab, - bool allow_spin) -{ - void *start; - void *cur; - void *next; - unsigned long idx, pos, page_limit, freelist_count; - - if (slab->objects < 2 || !s->random_seq) - return false; - - freelist_count = oo_objects(s->oo); - if (allow_spin) { - pos = get_random_u32_below(freelist_count); - } else { - struct rnd_state *state; - - /* - * An interrupt or NMI handler might interrupt and change - * the state in the middle, but that's safe. - */ - state = &get_cpu_var(slab_rnd_state); - pos = prandom_u32_state(state) % freelist_count; - put_cpu_var(slab_rnd_state); - } - - page_limit = slab->objects * s->size; - start = fixup_red_left(s, slab_address(slab)); - - /* First entry is used as the base of the freelist */ - cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count); - cur = setup_object(s, cur); - slab->freelist = cur; - - for (idx = 1; idx < slab->objects; idx++) { - next = next_freelist_entry(s, &pos, start, page_limit, - freelist_count); - next = setup_object(s, next); - set_freepointer(s, cur, next); - cur = next; - } - set_freepointer(s, cur, NULL); - - return true; -} #else static inline int init_cache_random_seq(struct kmem_cache *s) { return 0; } static inline void init_freelist_randomization(void) { } -static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab, - bool allow_spin) -{ - return false; -} #endif /* CONFIG_SLAB_FREELIST_RANDOM */ static __always_inline void account_slab(struct slab *slab, int order, @@ -3438,15 +3376,14 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, -(PAGE_SIZE << order)); } +/* Allocate and initialize a slab without building its freelist. */ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; - void *start, *p, *next; - int idx; - bool shuffle; + void *start; flags &= gfp_allowed_mask; @@ -3497,21 +3434,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) alloc_slab_obj_exts_early(s, slab); account_slab(slab, oo_order(oo), s, flags); - shuffle = shuffle_freelist(s, slab, allow_spin); - - if (!shuffle) { - start = fixup_red_left(s, start); - start = setup_object(s, start); - slab->freelist = start; - for (idx = 0, p = start; idx < slab->objects - 1; idx++) { - next = p + s->size; - next = setup_object(s, next); - set_freepointer(s, p, next); - p = next; - } - set_freepointer(s, p, NULL); - } - return slab; } @@ -3599,15 +3521,21 @@ static inline void slab_clear_node_partial(struct slab *slab) /* * Management of partially allocated slabs. */ +static inline void set_node_partial_state(struct kmem_cache_node *n, + struct slab *slab) +{ + slab_set_node_partial(slab); + n->nr_partial++; +} + static inline void __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) { - n->nr_partial++; if (mode == ADD_TO_TAIL) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); - slab_set_node_partial(slab); + set_node_partial_state(n, slab); } static inline void add_partial(struct kmem_cache_node *n, @@ -3617,13 +3545,19 @@ static inline void add_partial(struct kmem_cache_node *n, __add_partial(n, slab, mode); } +static inline void clear_node_partial_state(struct kmem_cache_node *n, + struct slab *slab) +{ + slab_clear_node_partial(slab); + n->nr_partial--; +} + static inline void remove_partial(struct kmem_cache_node *n, struct slab *slab) { lockdep_assert_held(&n->list_lock); list_del(&slab->slab_list); - slab_clear_node_partial(slab); - n->nr_partial--; + clear_node_partial_state(n, slab); } /* @@ -3665,30 +3599,112 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +/* Return the next free object in allocation order. */ +static inline void *next_slab_obj(struct kmem_cache *s, + struct slab_obj_iter *iter) +{ +#ifdef CONFIG_SLAB_FREELIST_RANDOM + if (iter->random) { + unsigned long idx; + + /* + * If the target page allocation failed, the number of objects on the + * page might be smaller than the usual size defined by the cache. + */ + do { + idx = s->random_seq[iter->pos]; + iter->pos++; + if (iter->pos >= iter->freelist_count) + iter->pos = 0; + } while (unlikely(idx >= iter->page_limit)); + + return setup_object(s, (char *)iter->start + idx); + } +#endif + return setup_object(s, (char *)iter->start + iter->pos++ * s->size); +} + +/* Build a freelist from the objects not yet allocated from a fresh slab. */ +static inline void build_slab_freelist(struct kmem_cache *s, struct slab *slab, + struct slab_obj_iter *iter) +{ + unsigned int nr = slab->objects - slab->inuse; + unsigned int i; + void *cur, *next; + + if (!nr) { + slab->freelist = NULL; + return; + } + + cur = next_slab_obj(s, iter); + slab->freelist = cur; + + for (i = 1; i < nr; i++) { + next = next_slab_obj(s, iter); + set_freepointer(s, cur, next); + cur = next; + } + + set_freepointer(s, cur, NULL); +} + +/* Initialize an iterator over free objects in allocation order. */ +static inline void init_slab_obj_iter(struct kmem_cache *s, struct slab *slab, + struct slab_obj_iter *iter, + bool allow_spin) +{ + iter->pos = 0; + iter->start = fixup_red_left(s, slab_address(slab)); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM + iter->random = (slab->objects >= 2 && s->random_seq); + if (!iter->random) + return; + + iter->freelist_count = oo_objects(s->oo); + iter->page_limit = slab->objects * s->size; + + if (allow_spin) { + iter->pos = get_random_u32_below(iter->freelist_count); + } else { + struct rnd_state *state; + + /* + * An interrupt or NMI handler might interrupt and change + * the state in the middle, but that's safe. + */ + state = &get_cpu_var(slab_rnd_state); + iter->pos = prandom_u32_state(state) % iter->freelist_count; + put_cpu_var(slab_rnd_state); + } +#endif +} + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, - int orig_size, gfp_t gfpflags) + int orig_size, bool allow_spin) { - bool allow_spin = gfpflags_allow_spinning(gfpflags); - int nid = slab_nid(slab); - struct kmem_cache_node *n = get_node(s, nid); + struct kmem_cache_node *n; + struct slab_obj_iter iter; + bool needs_add_partial; unsigned long flags; void *object; - if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab. */ - free_new_slab_nolock(s, slab); - return NULL; - } - - object = slab->freelist; - slab->freelist = get_freepointer(s, object); + init_slab_obj_iter(s, slab, &iter, allow_spin); + object = next_slab_obj(s, &iter); slab->inuse = 1; + needs_add_partial = (slab->objects > 1); + build_slab_freelist(s, slab, &iter); + + /* alloc_debug_processing() always expects a valid freepointer */ + set_freepointer(s, object, slab->freelist); + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a @@ -3696,20 +3712,32 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, * corruption in theory could cause that. * Leak memory of allocated slab. */ - if (!allow_spin) - spin_unlock_irqrestore(&n->list_lock, flags); return NULL; } - if (allow_spin) + n = get_node(s, slab_nid(slab)); + if (allow_spin) { spin_lock_irqsave(&n->list_lock, flags); + } else if (!spin_trylock_irqsave(&n->list_lock, flags)) { + /* + * Unlucky, discard newly allocated slab. + * The slab is not fully free, but it's fine as + * objects are not allocated to users. + */ + free_new_slab_nolock(s, slab); + return NULL; + } - if (slab->inuse == slab->objects) - add_full(s, n, slab); - else + if (needs_add_partial) add_partial(n, slab, ADD_TO_HEAD); + else + add_full(s, n, slab); - inc_slabs_node(s, nid, slab->objects); + /* + * Debug caches require nr_slabs updates under n->list_lock so validation + * cannot race with slab (de)allocations and observe inconsistent state. + */ + inc_slabs_node(s, slab_nid(slab), slab->objects); spin_unlock_irqrestore(&n->list_lock, flags); return object; @@ -3723,6 +3751,7 @@ static bool get_partial_node_bulk(struct kmem_cache *s, bool allow_spin) { struct slab *slab, *slab2; + struct slab *first = NULL, *last = NULL; unsigned int total_free = 0; unsigned long flags; @@ -3741,8 +3770,15 @@ static bool get_partial_node_bulk(struct kmem_cache *s, struct freelist_counters flc; unsigned int slab_free; - if (!pfmemalloc_match(slab, pc->flags)) + if (!pfmemalloc_match(slab, pc->flags)) { + if (first) { + list_bulk_move_tail(&pc->slabs, + &first->slab_list, + &last->slab_list); + first = NULL; + } continue; + } /* * determine the number of free objects in the slab racily @@ -3759,15 +3795,20 @@ static bool get_partial_node_bulk(struct kmem_cache *s, && total_free + slab_free > pc->max_objects) break; - remove_partial(n, slab); - - list_add(&slab->slab_list, &pc->slabs); + if (!first) + first = slab; + last = slab; + clear_node_partial_state(n, slab); total_free += slab_free; if (total_free >= pc->max_objects) break; } + if (first) + list_bulk_move_tail(&pc->slabs, &first->slab_list, + &last->slab_list); + spin_unlock_irqrestore(&n->list_lock, flags); return total_free > 0; } @@ -4311,7 +4352,8 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) * Assumes this is performed only for caches without debugging so we * don't need to worry about adding the slab to the full list. */ -static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab, + unsigned int *count) { struct freelist_counters old, new; @@ -4327,6 +4369,7 @@ static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *sla } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); + *count = old.objects - old.inuse; return old.freelist; } @@ -4349,44 +4392,41 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, void **p, unsigned int count, bool allow_spin) { unsigned int allocated = 0; - struct kmem_cache_node *n; - bool needs_add_partial; + struct slab_obj_iter iter; + bool needs_add_partial = true; unsigned long flags; - void *object; /* * Are we going to put the slab on the partial list? * Note slab->inuse is 0 on a new slab. */ - needs_add_partial = (slab->objects > count); - - if (!allow_spin && needs_add_partial) { - - n = get_node(s, slab_nid(slab)); - - if (!spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab */ - free_new_slab_nolock(s, slab); - return 0; - } + if (count >= slab->objects) { + needs_add_partial = false; + count = slab->objects; } - object = slab->freelist; - while (object && allocated < count) { - p[allocated] = object; - object = get_freepointer(s, object); - maybe_wipe_obj_freeptr(s, p[allocated]); + init_slab_obj_iter(s, slab, &iter, allow_spin); - slab->inuse++; + while (allocated < count) { + p[allocated] = next_slab_obj(s, &iter); allocated++; } - slab->freelist = object; + slab->inuse = count; + build_slab_freelist(s, slab, &iter); if (needs_add_partial) { + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); if (allow_spin) { - n = get_node(s, slab_nid(slab)); spin_lock_irqsave(&n->list_lock, flags); + } else if (!spin_trylock_irqsave(&n->list_lock, flags)) { + /* + * Unlucky, discard newly allocated slab. + * The slab is not fully free, but it's fine as + * objects are not allocated to users. + */ + free_new_slab_nolock(s, slab); + return 0; } add_partial(n, slab, ADD_TO_HEAD); spin_unlock_irqrestore(&n->list_lock, flags); @@ -4457,15 +4497,13 @@ new_objects: stat(s, ALLOC_SLAB); if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); + object = alloc_single_from_new_slab(s, slab, orig_size, allow_spin); if (likely(object)) goto success; } else { - alloc_from_new_slab(s, slab, &object, 1, allow_spin); - /* we don't need to check SLAB_STORE_USER here */ - if (likely(object)) + if (alloc_from_new_slab(s, slab, &object, 1, allow_spin)) return object; } @@ -4981,8 +5019,8 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, return ret; } -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p); +static bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); /* * returns a sheaf that has at least the requested size @@ -5002,21 +5040,20 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) if (unlikely(size > s->sheaf_capacity)) { - sheaf = kzalloc_flex(*sheaf, objects, size, gfp); + sheaf = __alloc_empty_sheaf(s, gfp, size); if (!sheaf) return NULL; stat(s, SHEAF_PREFILL_OVERSIZE); - sheaf->cache = s; sheaf->capacity = size; /* * we do not need to care about pfmemalloc here because oversize - * sheaves area always flushed and freed when returned + * sheaves are always flushed and freed when returned */ if (!__kmem_cache_alloc_bulk(s, gfp, size, &sheaf->objects[0])) { - kfree(sheaf); + free_empty_sheaf(s, sheaf); return NULL; } @@ -5084,7 +5121,7 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, if (unlikely((sheaf->capacity != s->sheaf_capacity) || sheaf->pfmemalloc)) { sheaf_flush_unused(s, sheaf); - kfree(sheaf); + free_empty_sheaf(s, sheaf); return; } @@ -5154,9 +5191,8 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, - &sheaf->objects[sheaf->size])) { + &sheaf->objects[sheaf->size])) return -ENOMEM; - } sheaf->size = sheaf->capacity; return 0; @@ -5275,7 +5311,7 @@ EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, - unsigned long caller) + unsigned long caller, kmalloc_token_t token) { struct kmem_cache *s; void *ret; @@ -5290,37 +5326,28 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, if (unlikely(!size)) return ZERO_SIZE_PTR; - s = kmalloc_slab(size, b, flags, caller); + s = kmalloc_slab(size, b, flags, token); ret = slab_alloc_node(s, NULL, flags, node, caller, size); ret = kasan_kmalloc(s, ret, size, flags); trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } -void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kmalloc_node_noprof(DECL_KMALLOC_PARAMS(size, b, token), gfp_t flags, int node) { - return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, + _RET_IP_, PASS_TOKEN_PARAM(token)); } EXPORT_SYMBOL(__kmalloc_node_noprof); -void *__kmalloc_noprof(size_t size, gfp_t flags) +void *__kmalloc_noprof(DECL_TOKEN_PARAMS(size, token), gfp_t flags) { - return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_, + PASS_TOKEN_PARAM(token)); } EXPORT_SYMBOL(__kmalloc_noprof); -/** - * kmalloc_nolock - Allocate an object of given size from any context. - * @size: size to allocate - * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT - * allowed. - * @node: node number of the target node. - * - * Return: pointer to the new object or NULL in case of error. - * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. - * There is no reason to call it again and expect !NULL. - */ -void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +void *_kmalloc_nolock_noprof(DECL_TOKEN_PARAMS(size, token), gfp_t gfp_flags, int node) { gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; struct kmem_cache *s; @@ -5347,7 +5374,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; - s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + s = kmalloc_slab(size, NULL, alloc_gfp, PASS_TOKEN_PARAM(token)); if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) /* @@ -5400,12 +5427,13 @@ success: ret = kasan_kmalloc(s, ret, size, alloc_gfp); return ret; } -EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); +EXPORT_SYMBOL_GPL(_kmalloc_nolock_noprof); -void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, +void *__kmalloc_node_track_caller_noprof(DECL_KMALLOC_PARAMS(size, b, token), gfp_t flags, int node, unsigned long caller) { - return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, + caller, PASS_TOKEN_PARAM(token)); } EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); @@ -5500,6 +5528,34 @@ static noinline void free_to_partial_list( } /* + * Try returning (remainder of) the freelist that we just detached from the + * slab. Optimistically assume the slab is still full, so we don't need to find + * the tail of the detached freelist. + * + * Fail if the slab isn't full anymore due to a concurrent free. + */ +static bool __slab_try_return_freelist(struct kmem_cache *s, struct slab *slab, + void *head, int cnt) +{ + struct freelist_counters old, new; + + old.freelist = slab->freelist; + old.counters = slab->counters; + + if (old.freelist) + return false; + + new.freelist = head; + new.counters = old.counters; + new.inuse -= cnt; + + if (!slab_update_freelist(s, slab, &old, &new, "__slab_try_return_freelist")) + return false; + + return true; +} + +/* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * @@ -6636,7 +6692,7 @@ void kfree_nolock(const void *object) EXPORT_SYMBOL_GPL(kfree_nolock); static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid, kmalloc_token_t token) { void *ret; size_t ks = 0; @@ -6708,7 +6764,7 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); + ret = __kmalloc_node_track_caller_noprof(PASS_KMALLOC_PARAMS(new_size, NULL, token), flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -6719,45 +6775,7 @@ alloc_new: return ret; } -/** - * krealloc_node_align - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @align: desired alignment. - * @flags: the type of memory to allocate. - * @nid: NUMA node or NUMA_NO_NODE - * - * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size - * is 0 and @p is not a %NULL pointer, the object pointed to is freed. - * - * Only alignments up to those guaranteed by kmalloc() will be honored. Please see - * Documentation/core-api/memory-allocation.rst for more details. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * When slub_debug_orig_size() is off, krealloc() only knows about the bucket - * size of an allocation (but not the exact size it was allocated with) and - * hence implements the following semantics for shrinking and growing buffers - * with __GFP_ZERO:: - * - * new bucket - * 0 size size - * |--------|----------------| - * | keep | zero | - * - * Otherwise, the original allocation size 'orig_size' could be used to - * precisely clear the requested size, and the new size will also be stored - * as the new 'orig_size'. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, +void *krealloc_node_align_noprof(const void *p, DECL_TOKEN_PARAMS(new_size, token), unsigned long align, gfp_t flags, int nid) { void *ret; @@ -6767,7 +6785,7 @@ void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long a return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, align, flags, nid); + ret = __do_krealloc(p, new_size, align, flags, nid, PASS_TOKEN_PARAM(token)); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); @@ -6799,28 +6817,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) return flags; } -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @align: desired alignment. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Only alignments up to those guaranteed by kmalloc() will be honored. Please see - * Documentation/core-api/memory-allocation.rst for more details. - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, +void *__kvmalloc_node_noprof(DECL_KMALLOC_PARAMS(size, b, token), unsigned long align, gfp_t flags, int node) { bool allow_block; @@ -6832,7 +6829,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, */ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), kmalloc_gfp_adjust(flags, size), - node, _RET_IP_); + node, _RET_IP_, PASS_TOKEN_PARAM(token)); if (ret || size <= PAGE_SIZE) return ret; @@ -6917,34 +6914,7 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -/** - * kvrealloc_node_align - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @align: desired alignment - * @flags: the flags for the page level allocator - * @nid: NUMA node id - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * Only alignments up to those guaranteed by kmalloc() will be honored. Please see - * Documentation/core-api/memory-allocation.rst for more details. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, +void *kvrealloc_node_align_noprof(const void *p, DECL_TOKEN_PARAMS(size, token), unsigned long align, gfp_t flags, int nid) { void *n; @@ -6952,10 +6922,10 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig if (is_vmalloc_addr(p)) return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); + n = krealloc_node_align_noprof(p, PASS_TOKEN_PARAMS(size, token), align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_node_align_noprof(size, align, flags, nid); + n = __kvmalloc_node_noprof(PASS_KMALLOC_PARAMS(size, NULL, token), align, flags, nid); if (!n) return NULL; @@ -7126,60 +7096,56 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + unsigned int count; + list_del(&slab->slab_list); - object = get_freelist_nofreeze(s, slab); + object = get_freelist_nofreeze(s, slab, &count); - while (object && refilled < max) { + while (count && refilled < max) { p[refilled] = object; object = get_freepointer(s, object); maybe_wipe_obj_freeptr(s, p[refilled]); refilled++; + count--; } /* * Freelist had more objects than we can accommodate, we need to - * free them back. We can treat it like a detached freelist, just - * need to find the tail object. + * free them back. First we try to be optimistic and assume the + * slab is still full since we just detached its freelist. + * Otherwise we must find the tail object. */ - if (unlikely(object)) { + if (unlikely(count)) { void *head = object; void *tail; - int cnt = 0; + + if (__slab_try_return_freelist(s, slab, head, count)) { + list_add(&slab->slab_list, &pc.slabs); + break; + } do { tail = object; - cnt++; object = get_freepointer(s, object); } while (object); - __slab_free(s, slab, head, tail, cnt, _RET_IP_); + __slab_free(s, slab, head, tail, count, _RET_IP_); } if (refilled >= max) break; } - if (unlikely(!list_empty(&pc.slabs))) { + if (!list_empty(&pc.slabs)) { spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { - - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) - continue; + list_for_each_entry(slab, &pc.slabs, slab_list) + set_node_partial_state(n, slab); - list_del(&slab->slab_list); - add_partial(n, slab, ADD_TO_HEAD); - } + list_splice_tail(&pc.slabs, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); - - /* any slabs left are completely free and for discard */ - list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { - - list_del(&slab->slab_list); - discard_slab(s, slab); - } } return refilled; @@ -7275,10 +7241,6 @@ new_slab: stat(s, ALLOC_SLAB); - /* - * TODO: possible optimization - if we know we will consume the whole - * slab we might skip creating the freelist? - */ refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, /* allow_spin = */ true); @@ -7289,9 +7251,8 @@ out: return refilled; } -static inline -int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +static bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) { int i; @@ -7312,30 +7273,43 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, stat_add(s, ALLOC_SLOWPATH, i); } - return i; + return true; error: __kmem_cache_free_bulk(s, i, p); - return 0; - + return false; } -/* - * Note that interrupts must be enabled when calling this function and gfp - * flags must allow spinning. +/** + * kmem_cache_alloc_bulk - Allocate multiple objects + * @s: The cache to allocate from + * @flags: GFP_* flags. See kmalloc(). + * @size: Number of objects to allocate + * @p: Array of allocated objects + * + * Allocate @size objects from @s and places them into @p. @size must be larger + * than 0. + * + * Interrupts must be enabled when calling this function and @flags must allow + * spinning. + * + * Unlike alloc_pages_bulk(), this function does not check for already allocated + * objects in @p, and thus the caller does not need to zero it. + * + * Return: %true if the allocation succeeded, or %false if it failed. */ -int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +bool kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) { unsigned int i = 0; void *kfence_obj; if (!size) - return 0; + return false; s = slab_pre_alloc_hook(s, flags); if (unlikely(!s)) - return 0; + return false; /* * to make things simpler, only assume at most once kfence allocated @@ -7352,18 +7326,18 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, } i = alloc_from_pcs_bulk(s, flags, size, p); - if (i < size) { /* * If we ran out of memory, don't bother with freeing back to * the percpu sheaves, we have bigger problems. */ - if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (unlikely(!__kmem_cache_alloc_bulk(s, flags, size - i, + p + i))) { if (i > 0) __kmem_cache_free_bulk(s, i, p); if (kfence_obj) __kfence_free(kfence_obj); - return 0; + return false; } } @@ -7378,16 +7352,9 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, } out: - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. - */ - if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size))) { - return 0; - } - - return size; + /* memcg and kmem_cache debug support and memory initialization */ + return likely(slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size)); } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); @@ -7609,6 +7576,7 @@ static void early_kmem_cache_node_alloc(int node) { struct slab *slab; struct kmem_cache_node *n; + struct slab_obj_iter iter; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -7620,14 +7588,18 @@ static void early_kmem_cache_node_alloc(int node) pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } - n = slab->freelist; + init_slab_obj_iter(kmem_cache_node, slab, &iter, true); + + n = next_slab_obj(kmem_cache_node, &iter); BUG_ON(!n); + + slab->inuse = 1; + build_slab_freelist(kmem_cache_node, slab, &iter); + #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); - slab->freelist = get_freepointer(kmem_cache_node, n); - slab->inuse = 1; kmem_cache_node->per_node[node].node = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -8245,8 +8217,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); - slab_clear_node_partial(slab); - n->nr_partial--; + clear_node_partial_state(n, slab); dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); @@ -8470,7 +8441,7 @@ static void __init bootstrap_kmalloc_sheaves(void) { enum kmalloc_cache_type type; - for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { + for (type = KMALLOC_NORMAL; type <= KMALLOC_PARTITION_END; type++) { for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { if (kmalloc_caches[type][idx]) bootstrap_cache_sheaves(kmalloc_caches[type][idx]); |
