about | summary | refs | log | tree | commit | diff | stats
path: root/net
diff options
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 09:18:14 +0100
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 09:18:14 +0100
commit: 9c87e61e3c5797277407ba5eae4eac8a52be3fa3 (patch)
tree: e3f902cb5363b5b90ab74a4b7e26fafbc15aaeaf /net
parent: b85966adbf5de0668a815c6e3527f87e0c387fb4 (diff)
parent: e4287bf34f97a88c7d9322f5bde828724c073a6b (diff)
download: ath-9c87e61e3c5797277407ba5eae4eac8a52be3fa3.tar.gz
Merge tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
 "Major changes:

   - Recover from BPF arena page faults using a scratch page and add
     ptep_try_set() for lockless empty-slot installs on x86 and arm64.
     This allows BPF kfuncs to access arena pointers directly. The
     'arena_direct_access' stable branch was created for this work and
     was pulled into sched-ext and bpf-next trees (Tejun Heo, Kumar
     Kartikeya Dwivedi)

   - Lift old restriction and support 6+ arguments in BPF programs and
     kfuncs on x86 and arm64 (Yonghong Song, Puranjay Mohan)

  Other features and fixes:

   - Add 24-bit BTF vlen and reclaim unused bits in the BTF UAPI to ease
     addition of new BTF kinds (Alan Maguire)

   - Raise the maximum BPF call chain depth from 8 to 16 frames (Alexei
     Starovoitov)

   - Refactor object relationship tracking in the verifier and fix a
     dynptr use-after-free bug (Amery Hung)

   - Harden the signed program loader and reject exclusive maps as inner
     maps (Daniel Borkmann)

   - Replace the verifier min/max bounds fields with a circular number
     (cnum) representation and improve 32->64 bit range refinements
     (Eduard Zingerman)

   - Introduce the arena library and runtime (libarena) with a buddy
     allocator, rbtree and SPMC queue data structures, ASAN support and
     a parallel test harness. Allow subprograms to return arena pointers
     and switch to a BTF type-tag based __arena annotation (Emil
     Tsalapatis)

   - Cache build IDs in the sleepable stackmap path and avoid faultable
     build ID reads under mm locks (Ihor Solodrai)

   - Introduce the tracing_multi link to attach a single BPF program to
     many kernel functions at once. Allow specifying the uprobe_multi
     target via FD (Jiri Olsa)

   - Extend the bpf_list family of kfuncs with bpf_list_add/del(), and
     bpf_list_is_first/is_last/empty() (Kaitao Cheng)

   - Extend the BPF syscall with common attributes support for
     prog_load, btf_load and map_create (Leon Hwang)

   - Wrap rhashtable as BPF map (Mykyta Yatsenko, Herbert Xu)

   - Add sleepable support for tracepoint programs and fix deadlocks in
     LRU map due to NMI reentry (Mykyta Yatsenko)

   - Fix OOB access in bpf_flow_keys, fix nullness analysis of inner
     arrays, enforce write checks for global subprograms (Nuoqi Gui)

   - Report the maximum combined stack depth and print a breakdown of
     instructions processed per subprogram (Paul Chaignon)

   - Add an XDP load-balancer benchmark and arm64 JIT support for stack
     arguments (Puranjay Mohan)

   - Add kfuncs to traverse over wakeup_sources (Samuel Wu)

   - Allow sleepable BPF programs to use LPM trie maps directly (Vlad
     Poenaru)

   - Many more fixes and cleanups across the verifier, BTF, sockmap,
     devmap, bpffs, security hooks, s390/riscv/loongarch JITs,
     rqspinlock, libbpf, bpftool, selftests"

* tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (336 commits)
  selftests/bpf: Work around llvm stack overflow in crypto progs
  selftests/bpf: add test for bpf_msg_pop_data() overflow
  bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check
  sockmap: Fix use-after-free in udp_bpf_recvmsg()
  bpf, sockmap: keep sk_msg copy state in sync
  bpf, sockmap: Fix wrong rsge offset in bpf_msg_push_data()
  bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data()
  selftsets/bpf: Retry map update on helper_fill_hashmap()
  selftests/bpf: Add test for sleepable lsm_cgroup rejection
  selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper
  bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
  selftests/bpf: Avoid static LLVM linking for cross builds
  selftests/bpf: Use common CFLAGS for urandom_read
  selftests/bpf: Initialize operation name before use
  tools/bpf: build: Append extra cflags
  libbpf: Initialize CFLAGS before including Makefile.include
  bpftool: Append extra host flags
  bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS
  bpftool: Pass host flags to bootstrap libbpf
  selftests/bpf: correct CONFIG_PPC64 macro name in comment
  ...
Diffstat (limited to 'net')
-rw-r--r--  net/bpf/bpf_dummy_struct_ops.c   14
-rw-r--r--  net/bpf/test_run.c               68
-rw-r--r--  net/core/filter.c               117
-rw-r--r--  net/core/lwt_bpf.c               12
-rw-r--r--  net/ipv4/udp_bpf.c                9
5 files changed, 167 insertions, 53 deletions
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ae5a54c350b9e..191a6b3ee2541 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -132,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops;
const struct btf_type *func_proto;
struct bpf_dummy_ops_test_args *args;
- struct bpf_tramp_links *tlinks = NULL;
+ struct bpf_tramp_nodes *tnodes = NULL;
struct bpf_tramp_link *link = NULL;
void *image = NULL;
unsigned int op_idx;
@@ -158,8 +158,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
if (err)
goto out;
- tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
- if (!tlinks) {
+ tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+ if (!tnodes) {
err = -ENOMEM;
goto out;
}
@@ -171,11 +171,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
}
/* prog doesn't take the ownership of the reference from caller */
bpf_prog_inc(prog);
- bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog,
- prog->expected_attach_type);
+ bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops,
+ prog, prog->expected_attach_type, 0);
op_idx = prog->expected_attach_type;
- err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node,
&st_ops->func_models[op_idx],
&dummy_ops_test_ret_function,
&image, &image_off,
@@ -198,7 +198,7 @@ out:
bpf_struct_ops_image_free(image);
if (link)
bpf_link_put(&link->link);
- kfree(tlinks);
+ kfree(tnodes);
return err;
}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index dbf0d8eae8d89..7fdee8f52ee2b 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -702,6 +702,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_TRACE_FSESSION_MULTI:
if (bpf_fentry_test1(1) != 2 ||
bpf_fentry_test2(2, 3) != 5 ||
bpf_fentry_test3(4, 5, 6) != 15 ||
@@ -747,14 +750,35 @@ static void
__bpf_prog_test_run_raw_tp(void *data)
{
struct bpf_raw_tp_test_run_info *info = data;
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_trace_run_ctx run_ctx = {};
struct bpf_run_ctx *old_run_ctx;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- rcu_read_lock();
+ if (info->prog->sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock();
+ }
+
+ if (unlikely(!bpf_prog_get_recursion_context(info->prog))) {
+ bpf_prog_inc_misses_counter(info->prog);
+ goto out;
+ }
+
info->retval = bpf_prog_run(info->prog, info->ctx);
- rcu_read_unlock();
+
+out:
+ bpf_prog_put_recursion_context(info->prog);
+
+ if (info->prog->sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock();
+ }
bpf_reset_run_ctx(old_run_ctx);
}
@@ -782,6 +806,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
return -EINVAL;
+ /*
+ * Sleepable programs cannot run with preemption disabled or in
+ * hardirq context (smp_call_function_single), reject the flag.
+ */
+ if (prog->sleepable && (kattr->test.flags & BPF_F_TEST_RUN_ON_CPU))
+ return -EINVAL;
+
if (ctx_size_in) {
info.ctx = memdup_user(ctx_in, ctx_size_in);
if (IS_ERR(info.ctx))
@@ -790,24 +821,31 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
info.ctx = NULL;
}
+ info.retval = 0;
info.prog = prog;
- current_cpu = get_cpu();
- if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
- cpu == current_cpu) {
+ if (prog->sleepable) {
__bpf_prog_test_run_raw_tp(&info);
- } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- /* smp_call_function_single() also checks cpu_online()
- * after csd_lock(). However, since cpu is from user
- * space, let's do an extra quick check to filter out
- * invalid value before smp_call_function_single().
- */
- err = -ENXIO;
} else {
- err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
- &info, 1);
+ current_cpu = get_cpu();
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+ cpu == current_cpu) {
+ __bpf_prog_test_run_raw_tp(&info);
+ } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ /*
+ * smp_call_function_single() also checks cpu_online()
+ * after csd_lock(). However, since cpu is from user
+ * space, let's do an extra quick check to filter out
+ * invalid value before smp_call_function_single().
+ */
+ err = -ENXIO;
+ } else {
+ err = smp_call_function_single(cpu,
+ __bpf_prog_test_run_raw_tp,
+ &info, 1);
+ }
+ put_cpu();
}
- put_cpu();
if (!err &&
copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
diff --git a/net/core/filter.c b/net/core/filter.c
index 40037413dd4ec..2e96b4b847ce1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2654,6 +2654,37 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
}
}
+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
+{
+ return test_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_clear_elem_copy(struct sk_msg *msg, u32 i)
+{
+ __clear_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool sg_copy)
+{
+ __assign_bit(i, msg->sg.copy, sg_copy);
+}
+
+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end)
+{
+ while (start != end) {
+ sk_msg_clear_elem_copy(msg, start);
+ sk_msg_iter_var_next(start);
+ }
+}
+
+static void sk_msg_sg_move(struct sk_msg *msg, u32 dst, u32 src)
+{
+ msg->sg.data[dst] = msg->sg.data[src];
+
+ sk_msg_set_elem_copy(msg, dst,
+ sk_msg_elem_is_copy(msg, src));
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
@@ -2692,7 +2723,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
+ if (!sk_msg_elem_is_copy(msg, i) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2733,13 +2764,13 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
poffset += len;
sge->length = 0;
put_page(sg_page(sge));
- __clear_bit(i, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, i);
sk_msg_iter_var_next(i);
} while (i != last_sge);
sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
- __clear_bit(first_sge, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, first_sge);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
@@ -2749,8 +2780,14 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
- if (!shift)
+ if (!shift) {
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
goto out;
+ }
+
+ i = first_sge;
+ sk_msg_iter_var_next(i);
+ sk_msg_clear_copy_range(msg, i, last_sge);
i = first_sge;
sk_msg_iter_var_next(i);
@@ -2764,18 +2801,18 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
if (move_from == msg->sg.end)
break;
- msg->sg.data[i] = msg->sg.data[move_from];
- sk_msg_sg_copy_assign(msg, i, msg, move_from);
+ sk_msg_sg_move(msg, i, move_from);
msg->sg.data[move_from].length = 0;
msg->sg.data[move_from].page_link = 0;
msg->sg.data[move_from].offset = 0;
- __clear_bit(move_from, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, move_from);
sk_msg_iter_var_next(i);
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
out:
sk_msg_reset_curr(msg);
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2796,9 +2833,10 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
- bool sge_copy, nsge_copy, nnsge_copy, rsge_copy = false;
+ bool rsge_copy = false;
u8 *raw, *to, *from;
struct page *page;
@@ -2834,6 +2872,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
if (!space || (space == 1 && start != offset))
copy = msg->sg.data[i].length;
+ if (unlikely(copy + len < copy))
+ return -EINVAL;
+
page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
get_order(copy + len));
if (unlikely(!page))
@@ -2871,7 +2912,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
- rsge_copy = test_bit(i, msg->sg.copy);
+ rsge_copy = sk_msg_elem_is_copy(msg, i);
psge->length = start - offset;
rsge.length -= psge->length;
@@ -2896,21 +2937,21 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Shift one or two slots as needed */
sge = sk_msg_elem_cpy(msg, new);
- sge_copy = test_bit(new, msg->sg.copy);
sg_unmark_end(&sge);
+ sge_copy = sk_msg_elem_is_copy(msg, new);
nsge = sk_msg_elem_cpy(msg, i);
- nsge_copy = test_bit(i, msg->sg.copy);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
if (rsge.length) {
sk_msg_iter_var_next(i);
nnsge = sk_msg_elem_cpy(msg, i);
- nnsge_copy = test_bit(i, msg->sg.copy);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_next(msg, end);
}
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
- __assign_bit(i, msg->sg.copy, sge_copy);
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sge = nsge;
sge_copy = nsge_copy;
sk_msg_iter_var_next(i);
@@ -2918,10 +2959,10 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
nsge = nnsge;
nsge_copy = nnsge_copy;
nnsge = sk_msg_elem_cpy(msg, i);
- nnsge_copy = test_bit(i, msg->sg.copy);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
} else {
nsge = sk_msg_elem_cpy(msg, i);
- nsge_copy = test_bit(i, msg->sg.copy);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
}
}
@@ -2929,14 +2970,15 @@ place_new:
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, new);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
sk_msg_iter_var_next(new);
msg->sg.data[new] = rsge;
- __assign_bit(new, msg->sg.copy, rsge_copy);
+ sk_msg_set_elem_copy(msg, new, rsge_copy);
}
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
@@ -2962,12 +3004,11 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
do {
prev = i;
sk_msg_iter_var_next(i);
- msg->sg.data[prev] = msg->sg.data[i];
- sk_msg_sg_copy_assign(msg, prev, msg, i);
+ sk_msg_sg_move(msg, prev, i);
} while (i != msg->sg.end);
sk_msg_iter_prev(msg, end);
- __clear_bit(msg->sg.end, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
}
static void sk_msg_shift_right(struct sk_msg *msg, int i)
@@ -2977,28 +3018,29 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
sk_msg_iter_next(msg, end);
sge = sk_msg_elem_cpy(msg, i);
- sge_copy = test_bit(i, msg->sg.copy);
+ sge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_var_next(i);
tmp = sk_msg_elem_cpy(msg, i);
- tmp_copy = test_bit(i, msg->sg.copy);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
- __assign_bit(i, msg->sg.copy, sge_copy);
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sk_msg_iter_var_next(i);
sge = tmp;
sge_copy = tmp_copy;
tmp = sk_msg_elem_cpy(msg, i);
- tmp_copy = test_bit(i, msg->sg.copy);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
}
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
}
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
u32 i = 0, l = 0, space, offset = 0;
- u64 last = start + len;
- int pop;
+ u64 last = (u64)start + len;
+ u32 pop;
if (unlikely(flags))
return -EINVAL;
@@ -3047,10 +3089,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ bool sge_copy = sk_msg_elem_is_copy(msg, i);
int a = start - offset;
int b = sge->length - pop - a;
- u32 sge_i = i;
- bool sge_copy = test_bit(i, msg->sg.copy);
+ u32 sge_idx = i;
sk_msg_iter_var_next(i);
@@ -3063,7 +3105,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
sg_set_page(nsge,
sg_page(sge),
b, sge->offset + pop + a);
- __assign_bit(i, msg->sg.copy, sge_copy);
+ sk_msg_set_elem_copy(msg, i, sge_copy);
} else {
struct page *page, *orig;
u8 *to, *from;
@@ -3080,7 +3122,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
memcpy(to, from, a);
memcpy(to + a, from + a + pop, b);
sg_set_page(sge, page, a + b, 0);
- __clear_bit(sge_i, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, sge_idx);
put_page(orig);
}
pop = 0;
@@ -5571,11 +5613,24 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
KERNEL_SOCKPTR(optval), *optlen);
}
+static bool sk_allows_sol_ip_sockopt(struct sock *sk)
+{
+ switch (sk->sk_family) {
+ case AF_INET:
+ return true;
+ case AF_INET6:
+ /* Allow getting/setting sockopt for possible ipv4-mapped ipv6 socket. */
+ return sk->sk_type != SOCK_RAW && !ipv6_only_sock(sk);
+ default:
+ return false;
+ }
+}
+
static int sol_ip_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
{
- if (sk->sk_family != AF_INET)
+ if (!sk_allows_sol_ip_sockopt(sk))
return -EINVAL;
switch (optname) {
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index f71ef82a5f3d3..bf588f508b79e 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -599,6 +599,7 @@ static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
+ bool is_udp_tunnel;
struct iphdr *iph;
bool ipv4;
int err;
@@ -612,10 +613,16 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
ipv4 = true;
if (unlikely(len < iph->ihl * 4))
return -EINVAL;
+ is_udp_tunnel = iph->protocol == IPPROTO_UDP;
+ if (unlikely(is_udp_tunnel && len < iph->ihl * 4 + sizeof(struct udphdr)))
+ return -EINVAL;
} else if (iph->version == 6) {
ipv4 = false;
if (unlikely(len < sizeof(struct ipv6hdr)))
return -EINVAL;
+ is_udp_tunnel = ((struct ipv6hdr *)iph)->nexthdr == NEXTHDR_UDP;
+ if (unlikely(is_udp_tunnel && len < sizeof(struct ipv6hdr) + sizeof(struct udphdr)))
+ return -EINVAL;
} else {
return -EINVAL;
}
@@ -637,6 +644,11 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
if (ingress)
skb_postpush_rcsum(skb, iph, len);
skb_reset_network_header(skb);
+ if (is_udp_tunnel) {
+ size_t iph_sz = ipv4 ? iph->ihl * 4 : sizeof(struct ipv6hdr);
+
+ skb_set_transport_header(skb, skb_network_offset(skb) + iph_sz);
+ }
memcpy(skb_network_header(skb), hdr, len);
bpf_compute_data_pointers(skb);
skb_clear_hash(skb);
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 9f33b07b14813..ad57c4c9eaab6 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -50,7 +50,9 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
ret = udp_msg_has_data(sk, psock);
if (!ret) {
+ release_sock(sk);
wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ lock_sock(sk);
ret = udp_msg_has_data(sk, psock);
}
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
@@ -79,6 +81,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
}
+ lock_sock(sk);
msg_bytes_ready:
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
if (!copied) {
@@ -90,11 +93,17 @@ msg_bytes_ready:
if (data) {
if (psock_has_data(psock))
goto msg_bytes_ready;
+
+ release_sock(sk);
+
ret = sk_udp_recvmsg(sk, msg, len, flags);
goto out;
}
copied = -EAGAIN;
}
+
+ release_sock(sk);
+
ret = copied;
out:
sk_psock_put(sk, psock);