1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 */
4#include <linux/bpf.h>
5#include <linux/btf.h>
6#include <linux/bpf-cgroup.h>
7#include <linux/cgroup.h>
8#include <linux/rcupdate.h>
9#include <linux/random.h>
10#include <linux/smp.h>
11#include <linux/topology.h>
12#include <linux/ktime.h>
13#include <linux/sched.h>
14#include <linux/uidgid.h>
15#include <linux/filter.h>
16#include <linux/ctype.h>
17#include <linux/jiffies.h>
18#include <linux/pid_namespace.h>
19#include <linux/poison.h>
20#include <linux/proc_ns.h>
21#include <linux/sched/task.h>
22#include <linux/security.h>
23#include <linux/btf_ids.h>
24#include <linux/bpf_mem_alloc.h>
25#include <linux/kasan.h>
26
27#include "../../lib/kstrtox.h"
28
29/* If a kernel subsystem allows eBPF programs to call this function, it should
30 * return bpf_map_lookup_elem_proto from its verifier_ops->get_func_proto()
31 * callback so that the verifier can properly check the arguments.
32 *
33 * Different map implementations rely on RCU in their lookup/update/delete
34 * methods, therefore eBPF programs must run under an RCU lock whenever they
35 * are allowed to access maps, so check rcu_read_lock_held() or
36 * rcu_read_lock_trace_held() in all three functions.
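 *
 * Illustrative get_func_proto() callback for a hypothetical subsystem (a
 * sketch only; the subsystem name is made up, the pattern mirrors
 * bpf_base_func_proto() below):
 *
 *	static const struct bpf_func_proto *
 *	my_subsys_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 *	{
 *		switch (func_id) {
 *		case BPF_FUNC_map_lookup_elem:
 *			return &bpf_map_lookup_elem_proto;
 *		case BPF_FUNC_map_update_elem:
 *			return &bpf_map_update_elem_proto;
 *		default:
 *			return bpf_base_func_proto(func_id, prog);
 *		}
 *	}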
37 */
38BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
39{
40	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
41		     !rcu_read_lock_bh_held());
42	return (unsigned long) map->ops->map_lookup_elem(map, key);
43}
44
45const struct bpf_func_proto bpf_map_lookup_elem_proto = {
46	.func		= bpf_map_lookup_elem,
47	.gpl_only	= false,
48	.pkt_access	= true,
49	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
50	.arg1_type	= ARG_CONST_MAP_PTR,
51	.arg2_type	= ARG_PTR_TO_MAP_KEY,
52};
53
54BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
55	   void *, value, u64, flags)
56{
57	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
58		     !rcu_read_lock_bh_held());
59	return map->ops->map_update_elem(map, key, value, flags);
60}
61
62const struct bpf_func_proto bpf_map_update_elem_proto = {
63	.func		= bpf_map_update_elem,
64	.gpl_only	= false,
65	.pkt_access	= true,
66	.ret_type	= RET_INTEGER,
67	.arg1_type	= ARG_CONST_MAP_PTR,
68	.arg2_type	= ARG_PTR_TO_MAP_KEY,
69	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
70	.arg4_type	= ARG_ANYTHING,
71};
72
73BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
74{
75	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
76		     !rcu_read_lock_bh_held());
77	return map->ops->map_delete_elem(map, key);
78}
79
80const struct bpf_func_proto bpf_map_delete_elem_proto = {
81	.func		= bpf_map_delete_elem,
82	.gpl_only	= false,
83	.pkt_access	= true,
84	.ret_type	= RET_INTEGER,
85	.arg1_type	= ARG_CONST_MAP_PTR,
86	.arg2_type	= ARG_PTR_TO_MAP_KEY,
87};
88
89BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
90{
91	return map->ops->map_push_elem(map, value, flags);
92}
93
94const struct bpf_func_proto bpf_map_push_elem_proto = {
95	.func		= bpf_map_push_elem,
96	.gpl_only	= false,
97	.pkt_access	= true,
98	.ret_type	= RET_INTEGER,
99	.arg1_type	= ARG_CONST_MAP_PTR,
100	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
101	.arg3_type	= ARG_ANYTHING,
102};
103
104BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
105{
106	return map->ops->map_pop_elem(map, value);
107}
108
109const struct bpf_func_proto bpf_map_pop_elem_proto = {
110	.func		= bpf_map_pop_elem,
111	.gpl_only	= false,
112	.ret_type	= RET_INTEGER,
113	.arg1_type	= ARG_CONST_MAP_PTR,
114	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
115};
116
117BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
118{
119	return map->ops->map_peek_elem(map, value);
120}
121
122const struct bpf_func_proto bpf_map_peek_elem_proto = {
123	.func		= bpf_map_peek_elem,
124	.gpl_only	= false,
125	.ret_type	= RET_INTEGER,
126	.arg1_type	= ARG_CONST_MAP_PTR,
127	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
128};
129
130BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
131{
132	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
133	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
134}
135
136const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
137	.func		= bpf_map_lookup_percpu_elem,
138	.gpl_only	= false,
139	.pkt_access	= true,
140	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
141	.arg1_type	= ARG_CONST_MAP_PTR,
142	.arg2_type	= ARG_PTR_TO_MAP_KEY,
143	.arg3_type	= ARG_ANYTHING,
144};
145
146const struct bpf_func_proto bpf_get_prandom_u32_proto = {
147	.func		= bpf_user_rnd_u32,
148	.gpl_only	= false,
149	.ret_type	= RET_INTEGER,
150};
151
152BPF_CALL_0(bpf_get_smp_processor_id)
153{
154	return smp_processor_id();
155}
156
157const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
158	.func		= bpf_get_smp_processor_id,
159	.gpl_only	= false,
160	.ret_type	= RET_INTEGER,
161};
162
163BPF_CALL_0(bpf_get_numa_node_id)
164{
165	return numa_node_id();
166}
167
168const struct bpf_func_proto bpf_get_numa_node_id_proto = {
169	.func		= bpf_get_numa_node_id,
170	.gpl_only	= false,
171	.ret_type	= RET_INTEGER,
172};
173
174BPF_CALL_0(bpf_ktime_get_ns)
175{
176	/* NMI safe access to clock monotonic */
177	return ktime_get_mono_fast_ns();
178}
179
180const struct bpf_func_proto bpf_ktime_get_ns_proto = {
181	.func		= bpf_ktime_get_ns,
182	.gpl_only	= false,
183	.ret_type	= RET_INTEGER,
184};
185
186BPF_CALL_0(bpf_ktime_get_boot_ns)
187{
188	/* NMI safe access to clock boottime */
189	return ktime_get_boot_fast_ns();
190}
191
192const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
193	.func		= bpf_ktime_get_boot_ns,
194	.gpl_only	= false,
195	.ret_type	= RET_INTEGER,
196};
197
198BPF_CALL_0(bpf_ktime_get_coarse_ns)
199{
200	return ktime_get_coarse_ns();
201}
202
203const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
204	.func		= bpf_ktime_get_coarse_ns,
205	.gpl_only	= false,
206	.ret_type	= RET_INTEGER,
207};
208
209BPF_CALL_0(bpf_ktime_get_tai_ns)
210{
211	/* NMI safe access to clock tai */
212	return ktime_get_tai_fast_ns();
213}
214
215const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
216	.func		= bpf_ktime_get_tai_ns,
217	.gpl_only	= false,
218	.ret_type	= RET_INTEGER,
219};
220
221BPF_CALL_0(bpf_get_current_pid_tgid)
222{
223	struct task_struct *task = current;
224
225	if (unlikely(!task))
226		return -EINVAL;
227
228	return (u64) task->tgid << 32 | task->pid;
229}
230
231const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
232	.func		= bpf_get_current_pid_tgid,
233	.gpl_only	= false,
234	.ret_type	= RET_INTEGER,
235};
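
/* Illustrative BPF-program-side use (a sketch, not part of this file's build):
 * the upper 32 bits carry the tgid and the lower 32 bits the kernel pid,
 * i.e. the user-space visible tid of the current task.
 *
 *	__u64 id = bpf_get_current_pid_tgid();
 *	__u32 tgid = id >> 32;
 *	__u32 tid = (__u32)id;
 */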
236
237BPF_CALL_0(bpf_get_current_uid_gid)
238{
239	struct task_struct *task = current;
240	kuid_t uid;
241	kgid_t gid;
242
243	if (unlikely(!task))
244		return -EINVAL;
245
246	current_uid_gid(&uid, &gid);
247	return (u64) from_kgid(&init_user_ns, gid) << 32 |
248		     from_kuid(&init_user_ns, uid);
249}
250
251const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
252	.func		= bpf_get_current_uid_gid,
253	.gpl_only	= false,
254	.ret_type	= RET_INTEGER,
255};
256
257BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
258{
259	struct task_struct *task = current;
260
261	if (unlikely(!task))
262		goto err_clear;
263
264	/* Verifier guarantees that size > 0 */
265	strscpy_pad(buf, task->comm, size);
266	return 0;
267err_clear:
268	memset(buf, 0, size);
269	return -EINVAL;
270}
271
272const struct bpf_func_proto bpf_get_current_comm_proto = {
273	.func		= bpf_get_current_comm,
274	.gpl_only	= false,
275	.ret_type	= RET_INTEGER,
276	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
277	.arg2_type	= ARG_CONST_SIZE,
278};
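
/* Illustrative BPF-program-side use (a sketch; TASK_COMM_LEN is 16):
 *
 *	char comm[16];
 *
 *	bpf_get_current_comm(comm, sizeof(comm));
 */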
279
280#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
281
282static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
283{
284	arch_spinlock_t *l = (void *)lock;
285	union {
286		__u32 val;
287		arch_spinlock_t lock;
288	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
289
290	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
291	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
292	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
293	preempt_disable();
294	arch_spin_lock(l);
295}
296
297static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
298{
299	arch_spinlock_t *l = (void *)lock;
300
301	arch_spin_unlock(l);
302	preempt_enable();
303}
304
305#else
306
307static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
308{
309	atomic_t *l = (void *)lock;
310
311	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
312	do {
313		atomic_cond_read_relaxed(l, !VAL);
314	} while (atomic_xchg(l, 1));
315}
316
317static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
318{
319	atomic_t *l = (void *)lock;
320
321	atomic_set_release(l, 0);
322}
323
324#endif
325
326static DEFINE_PER_CPU(unsigned long, irqsave_flags);
327
328static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
329{
330	unsigned long flags;
331
332	local_irq_save(flags);
333	__bpf_spin_lock(lock);
334	__this_cpu_write(irqsave_flags, flags);
335}
336
337NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
338{
339	__bpf_spin_lock_irqsave(lock);
340	return 0;
341}
342
343const struct bpf_func_proto bpf_spin_lock_proto = {
344	.func		= bpf_spin_lock,
345	.gpl_only	= false,
346	.ret_type	= RET_VOID,
347	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
348	.arg1_btf_id    = BPF_PTR_POISON,
349};
350
351static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
352{
353	unsigned long flags;
354
355	flags = __this_cpu_read(irqsave_flags);
356	__bpf_spin_unlock(lock);
357	local_irq_restore(flags);
358}
359
360NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
361{
362	__bpf_spin_unlock_irqrestore(lock);
363	return 0;
364}
365
366const struct bpf_func_proto bpf_spin_unlock_proto = {
367	.func		= bpf_spin_unlock,
368	.gpl_only	= false,
369	.ret_type	= RET_VOID,
370	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
371	.arg1_btf_id    = BPF_PTR_POISON,
372};
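
/* Illustrative BPF-program-side use of bpf_spin_lock()/bpf_spin_unlock() on a
 * map value (a sketch; 'my_map' and 'struct map_val' are made-up names):
 *
 *	struct map_val {
 *		struct bpf_spin_lock lock;
 *		int data;
 *	};
 *
 *	val = bpf_map_lookup_elem(&my_map, &key);
 *	if (val) {
 *		bpf_spin_lock(&val->lock);
 *		val->data++;
 *		bpf_spin_unlock(&val->lock);
 *	}
 */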
373
374void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
375			   bool lock_src)
376{
377	struct bpf_spin_lock *lock;
378
379	if (lock_src)
380		lock = src + map->record->spin_lock_off;
381	else
382		lock = dst + map->record->spin_lock_off;
383	preempt_disable();
384	__bpf_spin_lock_irqsave(lock);
385	copy_map_value(map, dst, src);
386	__bpf_spin_unlock_irqrestore(lock);
387	preempt_enable();
388}
389
390BPF_CALL_0(bpf_jiffies64)
391{
392	return get_jiffies_64();
393}
394
395const struct bpf_func_proto bpf_jiffies64_proto = {
396	.func		= bpf_jiffies64,
397	.gpl_only	= false,
398	.ret_type	= RET_INTEGER,
399};
400
401#ifdef CONFIG_CGROUPS
402BPF_CALL_0(bpf_get_current_cgroup_id)
403{
404	struct cgroup *cgrp;
405	u64 cgrp_id;
406
407	rcu_read_lock();
408	cgrp = task_dfl_cgroup(current);
409	cgrp_id = cgroup_id(cgrp);
410	rcu_read_unlock();
411
412	return cgrp_id;
413}
414
415const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
416	.func		= bpf_get_current_cgroup_id,
417	.gpl_only	= false,
418	.ret_type	= RET_INTEGER,
419};
420
421BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
422{
423	struct cgroup *cgrp;
424	struct cgroup *ancestor;
425	u64 cgrp_id;
426
427	rcu_read_lock();
428	cgrp = task_dfl_cgroup(current);
429	ancestor = cgroup_ancestor(cgrp, ancestor_level);
430	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
431	rcu_read_unlock();
432
433	return cgrp_id;
434}
435
436const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
437	.func		= bpf_get_current_ancestor_cgroup_id,
438	.gpl_only	= false,
439	.ret_type	= RET_INTEGER,
440	.arg1_type	= ARG_ANYTHING,
441};
442#endif /* CONFIG_CGROUPS */
443
444#define BPF_STRTOX_BASE_MASK 0x1F
445
446static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
447			  unsigned long long *res, bool *is_negative)
448{
449	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
450	const char *cur_buf = buf;
451	size_t cur_len = buf_len;
452	unsigned int consumed;
453	size_t val_len;
454	char str[64];
455
456	if (!buf || !buf_len || !res || !is_negative)
457		return -EINVAL;
458
459	if (base != 0 && base != 8 && base != 10 && base != 16)
460		return -EINVAL;
461
462	if (flags & ~BPF_STRTOX_BASE_MASK)
463		return -EINVAL;
464
465	while (cur_buf < buf + buf_len && isspace(*cur_buf))
466		++cur_buf;
467
468	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
469	if (*is_negative)
470		++cur_buf;
471
472	consumed = cur_buf - buf;
473	cur_len -= consumed;
474	if (!cur_len)
475		return -EINVAL;
476
477	cur_len = min(cur_len, sizeof(str) - 1);
478	memcpy(str, cur_buf, cur_len);
479	str[cur_len] = '\0';
480	cur_buf = str;
481
482	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
483	val_len = _parse_integer(cur_buf, base, res);
484
485	if (val_len & KSTRTOX_OVERFLOW)
486		return -ERANGE;
487
488	if (val_len == 0)
489		return -EINVAL;
490
491	cur_buf += val_len;
492	consumed += cur_buf - str;
493
494	return consumed;
495}
496
497static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
498			 long long *res)
499{
500	unsigned long long _res;
501	bool is_negative;
502	int err;
503
504	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
505	if (err < 0)
506		return err;
507	if (is_negative) {
508		if ((long long)-_res > 0)
509			return -ERANGE;
510		*res = -_res;
511	} else {
512		if ((long long)_res < 0)
513			return -ERANGE;
514		*res = _res;
515	}
516	return err;
517}
518
519BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
520	   long *, res)
521{
522	long long _res;
523	int err;
524
525	err = __bpf_strtoll(buf, buf_len, flags, &_res);
526	if (err < 0)
527		return err;
528	if (_res != (long)_res)
529		return -ERANGE;
530	*res = _res;
531	return err;
532}
533
534const struct bpf_func_proto bpf_strtol_proto = {
535	.func		= bpf_strtol,
536	.gpl_only	= false,
537	.ret_type	= RET_INTEGER,
538	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
539	.arg2_type	= ARG_CONST_SIZE,
540	.arg3_type	= ARG_ANYTHING,
541	.arg4_type	= ARG_PTR_TO_LONG,
542};
543
544BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
545	   unsigned long *, res)
546{
547	unsigned long long _res;
548	bool is_negative;
549	int err;
550
551	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
552	if (err < 0)
553		return err;
554	if (is_negative)
555		return -EINVAL;
556	if (_res != (unsigned long)_res)
557		return -ERANGE;
558	*res = _res;
559	return err;
560}
561
562const struct bpf_func_proto bpf_strtoul_proto = {
563	.func		= bpf_strtoul,
564	.gpl_only	= false,
565	.ret_type	= RET_INTEGER,
566	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
567	.arg2_type	= ARG_CONST_SIZE,
568	.arg3_type	= ARG_ANYTHING,
569	.arg4_type	= ARG_PTR_TO_LONG,
570};
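
/* Illustrative BPF-program-side use of bpf_strtol()/bpf_strtoul() (a sketch):
 * on success the return value is the number of characters consumed and the
 * parsed value is stored through the result pointer.
 *
 *	char buf[] = "  -42";
 *	long val;
 *	int n;
 *
 *	n = bpf_strtol(buf, sizeof(buf), 0, &val);
 *	// on success: n == 5 (characters consumed) and val == -42
 */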
571
572BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
573{
574	return strncmp(s1, s2, s1_sz);
575}
576
577static const struct bpf_func_proto bpf_strncmp_proto = {
578	.func		= bpf_strncmp,
579	.gpl_only	= false,
580	.ret_type	= RET_INTEGER,
581	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
582	.arg2_type	= ARG_CONST_SIZE,
583	.arg3_type	= ARG_PTR_TO_CONST_STR,
584};
585
586BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
587	   struct bpf_pidns_info *, nsdata, u32, size)
588{
589	struct task_struct *task = current;
590	struct pid_namespace *pidns;
591	int err = -EINVAL;
592
593	if (unlikely(size != sizeof(struct bpf_pidns_info)))
594		goto clear;
595
596	if (unlikely((u64)(dev_t)dev != dev))
597		goto clear;
598
599	if (unlikely(!task))
600		goto clear;
601
602	pidns = task_active_pid_ns(task);
603	if (unlikely(!pidns)) {
604		err = -ENOENT;
605		goto clear;
606	}
607
608	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
609		goto clear;
610
611	nsdata->pid = task_pid_nr_ns(task, pidns);
612	nsdata->tgid = task_tgid_nr_ns(task, pidns);
613	return 0;
614clear:
615	memset((void *)nsdata, 0, (size_t) size);
616	return err;
617}
618
619const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
620	.func		= bpf_get_ns_current_pid_tgid,
621	.gpl_only	= false,
622	.ret_type	= RET_INTEGER,
623	.arg1_type	= ARG_ANYTHING,
624	.arg2_type	= ARG_ANYTHING,
625	.arg3_type      = ARG_PTR_TO_UNINIT_MEM,
626	.arg4_type      = ARG_CONST_SIZE,
627};
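
/* Illustrative BPF-program-side use (a sketch): 'dev' and 'ino' would
 * typically be obtained by user space via stat() on /proc/self/ns/pid and
 * handed to the program through a map or .rodata.
 *
 *	struct bpf_pidns_info ns;
 *
 *	if (bpf_get_ns_current_pid_tgid(dev, ino, &ns, sizeof(ns)) == 0)
 *		pid = ns.pid;	// pid relative to that pid namespace
 */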
628
629static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
630	.func		= bpf_get_raw_cpu_id,
631	.gpl_only	= false,
632	.ret_type	= RET_INTEGER,
633};
634
635BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
636	   u64, flags, void *, data, u64, size)
637{
638	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
639		return -EINVAL;
640
641	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
642}
643
644const struct bpf_func_proto bpf_event_output_data_proto =  {
645	.func		= bpf_event_output_data,
646	.gpl_only       = true,
647	.ret_type       = RET_INTEGER,
648	.arg1_type      = ARG_PTR_TO_CTX,
649	.arg2_type      = ARG_CONST_MAP_PTR,
650	.arg3_type      = ARG_ANYTHING,
651	.arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
652	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
653};
654
655BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
656	   const void __user *, user_ptr)
657{
658	int ret = copy_from_user(dst, user_ptr, size);
659
660	if (unlikely(ret)) {
661		memset(dst, 0, size);
662		ret = -EFAULT;
663	}
664
665	return ret;
666}
667
668const struct bpf_func_proto bpf_copy_from_user_proto = {
669	.func		= bpf_copy_from_user,
670	.gpl_only	= false,
671	.might_sleep	= true,
672	.ret_type	= RET_INTEGER,
673	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
674	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
675	.arg3_type	= ARG_ANYTHING,
676};
677
678BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
679	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
680{
681	int ret;
682
683	/* flags is not used yet */
684	if (unlikely(flags))
685		return -EINVAL;
686
687	if (unlikely(!size))
688		return 0;
689
690	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
691	if (ret == size)
692		return 0;
693
694	memset(dst, 0, size);
695	/* Return -EFAULT for partial read */
696	return ret < 0 ? ret : -EFAULT;
697}
698
699const struct bpf_func_proto bpf_copy_from_user_task_proto = {
700	.func		= bpf_copy_from_user_task,
701	.gpl_only	= true,
702	.might_sleep	= true,
703	.ret_type	= RET_INTEGER,
704	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
705	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
706	.arg3_type	= ARG_ANYTHING,
707	.arg4_type	= ARG_PTR_TO_BTF_ID,
708	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
709	.arg5_type	= ARG_ANYTHING
710};
711
712BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
713{
714	if (cpu >= nr_cpu_ids)
715		return (unsigned long)NULL;
716
717	return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
718}
719
720const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
721	.func		= bpf_per_cpu_ptr,
722	.gpl_only	= false,
723	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
724	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
725	.arg2_type	= ARG_ANYTHING,
726};
727
728BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
729{
730	return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
731}
732
733const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
734	.func		= bpf_this_cpu_ptr,
735	.gpl_only	= false,
736	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
737	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
738};
739
740static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
741		size_t bufsz)
742{
743	void __user *user_ptr = (__force void __user *)unsafe_ptr;
744
745	buf[0] = 0;
746
747	switch (fmt_ptype) {
748	case 's':
749#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
750		if ((unsigned long)unsafe_ptr < TASK_SIZE)
751			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
752		fallthrough;
753#endif
754	case 'k':
755		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
756	case 'u':
757		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
758	}
759
760	return -EINVAL;
761}
762
763/* Per-cpu temp buffers used by printf-like helpers to store the binary
764 * representation of the bprintf arguments.
765 */
766#define MAX_BPRINTF_BIN_ARGS	512
767
768/* Support executing three nested bprintf helper calls on a given CPU */
769#define MAX_BPRINTF_NEST_LEVEL	3
770struct bpf_bprintf_buffers {
771	char bin_args[MAX_BPRINTF_BIN_ARGS];
772	char buf[MAX_BPRINTF_BUF];
773};
774
775static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
776static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
777
778static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
779{
780	int nest_level;
781
782	preempt_disable();
783	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
784	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
785		this_cpu_dec(bpf_bprintf_nest_level);
786		preempt_enable();
787		return -EBUSY;
788	}
789	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
790
791	return 0;
792}
793
794void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
795{
796	if (!data->bin_args && !data->buf)
797		return;
798	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
799		return;
800	this_cpu_dec(bpf_bprintf_nest_level);
801	preempt_enable();
802}
803
804/*
805 * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
806 *
807 * Returns a negative value if fmt is an invalid format string or 0 otherwise.
808 *
809 * This can be used in two ways:
810 * - Format string verification only: when data->get_bin_args is false
811 * - Arguments preparation: in addition to the above verification, it writes in
812 *   data->bin_args a binary representation of arguments usable by bstr_printf
813 *   where pointers from BPF have been sanitized.
814 *
815 * In argument preparation mode, if 0 is returned, safe temporary buffers are
816 * allocated and bpf_bprintf_cleanup should be called to free them after use.
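 *
 * Illustrative call pattern (a sketch; bpf_snprintf() below is an in-tree
 * caller following the same shape):
 *
 *	struct bpf_bprintf_data data = { .get_bin_args = true };
 *
 *	err = bpf_bprintf_prepare(fmt, fmt_size, raw_args, num_args, &data);
 *	if (err < 0)
 *		return err;
 *	err = bstr_printf(buf, buf_size, fmt, data.bin_args);
 *	bpf_bprintf_cleanup(&data);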
817 */
818int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
819			u32 num_args, struct bpf_bprintf_data *data)
820{
821	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
822	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
823	struct bpf_bprintf_buffers *buffers = NULL;
824	size_t sizeof_cur_arg, sizeof_cur_ip;
825	int err, i, num_spec = 0;
826	u64 cur_arg;
827	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
828
829	fmt_end = strnchr(fmt, fmt_size, 0);
830	if (!fmt_end)
831		return -EINVAL;
832	fmt_size = fmt_end - fmt;
833
834	if (get_buffers && try_get_buffers(&buffers))
835		return -EBUSY;
836
837	if (data->get_bin_args) {
838		if (num_args)
839			tmp_buf = buffers->bin_args;
840		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
841		data->bin_args = (u32 *)tmp_buf;
842	}
843
844	if (data->get_buf)
845		data->buf = buffers->buf;
846
847	for (i = 0; i < fmt_size; i++) {
848		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
849			err = -EINVAL;
850			goto out;
851		}
852
853		if (fmt[i] != '%')
854			continue;
855
856		if (fmt[i + 1] == '%') {
857			i++;
858			continue;
859		}
860
861		if (num_spec >= num_args) {
862			err = -EINVAL;
863			goto out;
864		}
865
866		/* The string is zero-terminated so if fmt[i] != 0, we can
867		 * always access fmt[i + 1]; in the worst case it will be a 0.
868		 */
869		i++;
870
871		/* skip optional "[0 +-][num]" width formatting field */
872		while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
873		       fmt[i] == ' ')
874			i++;
875		if (fmt[i] >= '1' && fmt[i] <= '9') {
876			i++;
877			while (fmt[i] >= '0' && fmt[i] <= '9')
878				i++;
879		}
880
881		if (fmt[i] == 'p') {
882			sizeof_cur_arg = sizeof(long);
883
884			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
885			    fmt[i + 2] == 's') {
886				fmt_ptype = fmt[i + 1];
887				i += 2;
888				goto fmt_str;
889			}
890
891			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
892			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
893			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
894			    fmt[i + 1] == 'S') {
895				/* just kernel pointers */
896				if (tmp_buf)
897					cur_arg = raw_args[num_spec];
898				i++;
899				goto nocopy_fmt;
900			}
901
902			if (fmt[i + 1] == 'B') {
903			if (tmp_buf) {
904					err = snprintf(tmp_buf,
905						       (tmp_buf_end - tmp_buf),
906						       "%pB",
907						       (void *)(long)raw_args[num_spec]);
908					tmp_buf += (err + 1);
909				}
910
911				i++;
912				num_spec++;
913				continue;
914			}
915
916			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
917			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
918			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
919				err = -EINVAL;
920				goto out;
921			}
922
923			i += 2;
924			if (!tmp_buf)
925				goto nocopy_fmt;
926
927			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
928			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
929				err = -ENOSPC;
930				goto out;
931			}
932
933			unsafe_ptr = (char *)(long)raw_args[num_spec];
934			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
935						       sizeof_cur_ip);
936			if (err < 0)
937				memset(cur_ip, 0, sizeof_cur_ip);
938
939			/* hack: bstr_printf expects IP addresses to be
940			 * pre-formatted as strings; ironically, the easiest way
941			 * to do that is to call snprintf.
942			 */
943			ip_spec[2] = fmt[i - 1];
944			ip_spec[3] = fmt[i];
945			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
946				       ip_spec, &cur_ip);
947
948			tmp_buf += err + 1;
949			num_spec++;
950
951			continue;
952		} else if (fmt[i] == 's') {
953			fmt_ptype = fmt[i];
954fmt_str:
955			if (fmt[i + 1] != 0 &&
956			    !isspace(fmt[i + 1]) &&
957			    !ispunct(fmt[i + 1])) {
958				err = -EINVAL;
959				goto out;
960			}
961
962			if (!tmp_buf)
963				goto nocopy_fmt;
964
965			if (tmp_buf_end == tmp_buf) {
966				err = -ENOSPC;
967				goto out;
968			}
969
970			unsafe_ptr = (char *)(long)raw_args[num_spec];
971			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
972						    fmt_ptype,
973						    tmp_buf_end - tmp_buf);
974			if (err < 0) {
975				tmp_buf[0] = '\0';
976				err = 1;
977			}
978
979			tmp_buf += err;
980			num_spec++;
981
982			continue;
983		} else if (fmt[i] == 'c') {
984			if (!tmp_buf)
985				goto nocopy_fmt;
986
987			if (tmp_buf_end == tmp_buf) {
988				err = -ENOSPC;
989				goto out;
990			}
991
992			*tmp_buf = raw_args[num_spec];
993			tmp_buf++;
994			num_spec++;
995
996			continue;
997		}
998
999		sizeof_cur_arg = sizeof(int);
1000
1001		if (fmt[i] == 'l') {
1002			sizeof_cur_arg = sizeof(long);
1003			i++;
1004		}
1005		if (fmt[i] == 'l') {
1006			sizeof_cur_arg = sizeof(long long);
1007			i++;
1008		}
1009
1010		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
1011		    fmt[i] != 'x' && fmt[i] != 'X') {
1012			err = -EINVAL;
1013			goto out;
1014		}
1015
1016		if (tmp_buf)
1017			cur_arg = raw_args[num_spec];
1018nocopy_fmt:
1019		if (tmp_buf) {
1020			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
1021			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
1022				err = -ENOSPC;
1023				goto out;
1024			}
1025
1026			if (sizeof_cur_arg == 8) {
1027				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
1028				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
1029			} else {
1030				*(u32 *)tmp_buf = (u32)(long)cur_arg;
1031			}
1032			tmp_buf += sizeof_cur_arg;
1033		}
1034		num_spec++;
1035	}
1036
1037	err = 0;
1038out:
1039	if (err)
1040		bpf_bprintf_cleanup(data);
1041	return err;
1042}
1043
1044BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
1045	   const void *, args, u32, data_len)
1046{
1047	struct bpf_bprintf_data data = {
1048		.get_bin_args	= true,
1049	};
1050	int err, num_args;
1051
1052	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
1053	    (data_len && !args))
1054		return -EINVAL;
1055	num_args = data_len / 8;
1056
1057	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
1058	 * can safely give an unbounded size.
1059	 */
1060	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
1061	if (err < 0)
1062		return err;
1063
1064	err = bstr_printf(str, str_size, fmt, data.bin_args);
1065
1066	bpf_bprintf_cleanup(&data);
1067
1068	return err + 1;
1069}
1070
1071const struct bpf_func_proto bpf_snprintf_proto = {
1072	.func		= bpf_snprintf,
1073	.gpl_only	= true,
1074	.ret_type	= RET_INTEGER,
1075	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
1076	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1077	.arg3_type	= ARG_PTR_TO_CONST_STR,
1078	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
1079	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
1080};
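
/* Illustrative BPF-program-side use (a sketch; libbpf also provides a
 * BPF_SNPRINTF() convenience macro that packs the argument array):
 *
 *	static const char fmt[] = "cpu=%u ts=%lu";
 *	__u64 args[] = { bpf_get_smp_processor_id(), bpf_ktime_get_ns() };
 *	char out[64];
 *
 *	bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));
 */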
1081
1082struct bpf_async_cb {
1083	struct bpf_map *map;
1084	struct bpf_prog *prog;
1085	void __rcu *callback_fn;
1086	void *value;
1087	struct rcu_head rcu;
1088	u64 flags;
1089};
1090
1091/* BPF map elements can contain 'struct bpf_timer'.
1092 * Such a map owns all of its BPF timers.
1093 * 'struct bpf_timer' is allocated as part of the map element allocation
1094 * and it is zero initialized.
1095 * That space is used to keep 'struct bpf_async_kern'.
1096 * bpf_timer_init() allocates 'struct bpf_hrtimer', inits the hrtimer, and
1097 * remembers the 'struct bpf_map *' pointer it's part of.
1098 * bpf_timer_set_callback() increments the prog refcnt and assigns the bpf callback_fn.
1099 * bpf_timer_start() arms the timer.
1100 * If the user space reference to a map goes to zero at this point,
1101 * the ops->map_release_uref callback is responsible for cancelling the timers,
1102 * freeing their memory, and decrementing the progs' refcnts.
1103 * bpf_timer_cancel() cancels the timer and decrements the prog's refcnt.
1104 * Inner maps can contain bpf timers as well. ops->map_release_uref frees
1105 * the timers when the inner map is replaced or deleted by user space.
1106 */
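
/* Illustrative BPF-program-side use (a sketch; 'my_map', 'struct map_val' and
 * 'timer_cb' are made-up names, the map value embeds a struct bpf_timer):
 *
 *	static int timer_cb(void *map, int *key, struct map_val *val)
 *	{
 *		return 0;
 *	}
 *
 *	val = bpf_map_lookup_elem(&my_map, &key);
 *	if (val) {
 *		bpf_timer_init(&val->timer, &my_map, CLOCK_MONOTONIC);
 *		bpf_timer_set_callback(&val->timer, timer_cb);
 *		bpf_timer_start(&val->timer, 1000000, 0);	// fire in 1ms
 *	}
 */
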
1107struct bpf_hrtimer {
1108	struct bpf_async_cb cb;
1109	struct hrtimer timer;
1110};
1111
1112struct bpf_work {
1113	struct bpf_async_cb cb;
1114	struct work_struct work;
1115	struct work_struct delete_work;
1116};
1117
1118/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
1119struct bpf_async_kern {
1120	union {
1121		struct bpf_async_cb *cb;
1122		struct bpf_hrtimer *timer;
1123		struct bpf_work *work;
1124	};
1125	/* bpf_spin_lock is used here instead of spinlock_t to make
1126	 * sure that it always fits into space reserved by struct bpf_timer
1127	 * regardless of LOCKDEP and spinlock debug flags.
1128	 */
1129	struct bpf_spin_lock lock;
1130} __attribute__((aligned(8)));
1131
1132enum bpf_async_type {
1133	BPF_ASYNC_TYPE_TIMER = 0,
1134	BPF_ASYNC_TYPE_WQ,
1135};
1136
1137static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
1138
1139static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
1140{
1141	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
1142	struct bpf_map *map = t->cb.map;
1143	void *value = t->cb.value;
1144	bpf_callback_t callback_fn;
1145	void *key;
1146	u32 idx;
1147
1148	BTF_TYPE_EMIT(struct bpf_timer);
1149	callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
1150	if (!callback_fn)
1151		goto out;
1152
1153	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
1154	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
1155	 * Remember the timer this callback is servicing to prevent
1156	 * deadlock if callback_fn() calls bpf_timer_cancel() or
1157	 * bpf_map_delete_elem() on the same timer.
1158	 */
1159	this_cpu_write(hrtimer_running, t);
1160	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1161		struct bpf_array *array = container_of(map, struct bpf_array, map);
1162
1163		/* compute the key */
1164		idx = ((char *)value - array->value) / array->elem_size;
1165		key = &idx;
1166	} else { /* hash or lru */
1167		key = value - round_up(map->key_size, 8);
1168	}
1169
1170	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1171	/* The verifier checked that return value is zero. */
1172
1173	this_cpu_write(hrtimer_running, NULL);
1174out:
1175	return HRTIMER_NORESTART;
1176}
1177
1178static void bpf_wq_work(struct work_struct *work)
1179{
1180	struct bpf_work *w = container_of(work, struct bpf_work, work);
1181	struct bpf_async_cb *cb = &w->cb;
1182	struct bpf_map *map = cb->map;
1183	bpf_callback_t callback_fn;
1184	void *value = cb->value;
1185	void *key;
1186	u32 idx;
1187
1188	BTF_TYPE_EMIT(struct bpf_wq);
1189
1190	callback_fn = READ_ONCE(cb->callback_fn);
1191	if (!callback_fn)
1192		return;
1193
1194	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1195		struct bpf_array *array = container_of(map, struct bpf_array, map);
1196
1197		/* compute the key */
1198		idx = ((char *)value - array->value) / array->elem_size;
1199		key = &idx;
1200	} else { /* hash or lru */
1201		key = value - round_up(map->key_size, 8);
1202	}
1203
1204	rcu_read_lock_trace();
1205	migrate_disable();
1206
1207	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1208
1209	migrate_enable();
1210	rcu_read_unlock_trace();
1211}
1212
1213static void bpf_wq_delete_work(struct work_struct *work)
1214{
1215	struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
1216
1217	cancel_work_sync(&w->work);
1218
1219	kfree_rcu(w, cb.rcu);
1220}
1221
1222static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
1223			    enum bpf_async_type type)
1224{
1225	struct bpf_async_cb *cb;
1226	struct bpf_hrtimer *t;
1227	struct bpf_work *w;
1228	clockid_t clockid;
1229	size_t size;
1230	int ret = 0;
1231
1232	if (in_nmi())
1233		return -EOPNOTSUPP;
1234
1235	switch (type) {
1236	case BPF_ASYNC_TYPE_TIMER:
1237		size = sizeof(struct bpf_hrtimer);
1238		break;
1239	case BPF_ASYNC_TYPE_WQ:
1240		size = sizeof(struct bpf_work);
1241		break;
1242	default:
1243		return -EINVAL;
1244	}
1245
1246	__bpf_spin_lock_irqsave(&async->lock);
1247	t = async->timer;
1248	if (t) {
1249		ret = -EBUSY;
1250		goto out;
1251	}
1252
1253	/* allocate hrtimer via map_kmalloc to use memcg accounting */
1254	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
1255	if (!cb) {
1256		ret = -ENOMEM;
1257		goto out;
1258	}
1259
1260	switch (type) {
1261	case BPF_ASYNC_TYPE_TIMER:
1262		clockid = flags & (MAX_CLOCKS - 1);
1263		t = (struct bpf_hrtimer *)cb;
1264
1265		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
1266		t->timer.function = bpf_timer_cb;
1267		cb->value = (void *)async - map->record->timer_off;
1268		break;
1269	case BPF_ASYNC_TYPE_WQ:
1270		w = (struct bpf_work *)cb;
1271
1272		INIT_WORK(&w->work, bpf_wq_work);
1273		INIT_WORK(&w->delete_work, bpf_wq_delete_work);
1274		cb->value = (void *)async - map->record->wq_off;
1275		break;
1276	}
1277	cb->map = map;
1278	cb->prog = NULL;
1279	cb->flags = flags;
1280	rcu_assign_pointer(cb->callback_fn, NULL);
1281
1282	WRITE_ONCE(async->cb, cb);
1283	/* Guarantee the order between async->cb and map->usercnt. So
1284	 * when there are concurrent uref release and bpf timer init, either
1285	 * bpf_timer_cancel_and_free() called by uref release reads a non-NULL
1286	 * timer or atomic64_read() below returns a zero usercnt.
1287	 */
1288	smp_mb();
1289	if (!atomic64_read(&map->usercnt)) {
1290		/* maps with timers must be either held by user space
1291		 * or pinned in bpffs.
1292		 */
1293		WRITE_ONCE(async->cb, NULL);
1294		kfree(cb);
1295		ret = -EPERM;
1296	}
1297out:
1298	__bpf_spin_unlock_irqrestore(&async->lock);
1299	return ret;
1300}
1301
1302BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
1303	   u64, flags)
1304{
1305	clock_t clockid = flags & (MAX_CLOCKS - 1);
1306
1307	BUILD_BUG_ON(MAX_CLOCKS != 16);
1308	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
1309	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
1310
1311	if (flags >= MAX_CLOCKS ||
1312	    /* similar to timerfd except _ALARM variants are not supported */
1313	    (clockid != CLOCK_MONOTONIC &&
1314	     clockid != CLOCK_REALTIME &&
1315	     clockid != CLOCK_BOOTTIME))
1316		return -EINVAL;
1317
1318	return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
1319}
1320
1321static const struct bpf_func_proto bpf_timer_init_proto = {
1322	.func		= bpf_timer_init,
1323	.gpl_only	= true,
1324	.ret_type	= RET_INTEGER,
1325	.arg1_type	= ARG_PTR_TO_TIMER,
1326	.arg2_type	= ARG_CONST_MAP_PTR,
1327	.arg3_type	= ARG_ANYTHING,
1328};
1329
1330static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
1331				    struct bpf_prog_aux *aux, unsigned int flags,
1332				    enum bpf_async_type type)
1333{
1334	struct bpf_prog *prev, *prog = aux->prog;
1335	struct bpf_async_cb *cb;
1336	int ret = 0;
1337
1338	if (in_nmi())
1339		return -EOPNOTSUPP;
1340	__bpf_spin_lock_irqsave(&async->lock);
1341	cb = async->cb;
1342	if (!cb) {
1343		ret = -EINVAL;
1344		goto out;
1345	}
1346	if (!atomic64_read(&cb->map->usercnt)) {
1347		/* maps with timers must be either held by user space
1348		 * or pinned in bpffs. Otherwise timer might still be
1349		 * running even when bpf prog is detached and user space
1350		 * is gone, since map_release_uref won't ever be called.
1351		 */
1352		ret = -EPERM;
1353		goto out;
1354	}
1355	prev = cb->prog;
1356	if (prev != prog) {
1357		/* Bump prog refcnt once. Every bpf_timer_set_callback()
1358		 * can pick different callback_fn-s within the same prog.
1359		 */
1360		prog = bpf_prog_inc_not_zero(prog);
1361		if (IS_ERR(prog)) {
1362			ret = PTR_ERR(prog);
1363			goto out;
1364		}
1365		if (prev)
1366			/* Drop prev prog refcnt when swapping with new prog */
1367			bpf_prog_put(prev);
1368		cb->prog = prog;
1369	}
1370	rcu_assign_pointer(cb->callback_fn, callback_fn);
1371out:
1372	__bpf_spin_unlock_irqrestore(&async->lock);
1373	return ret;
1374}
1375
1376BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
1377	   struct bpf_prog_aux *, aux)
1378{
1379	return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
1380}
1381
1382static const struct bpf_func_proto bpf_timer_set_callback_proto = {
1383	.func		= bpf_timer_set_callback,
1384	.gpl_only	= true,
1385	.ret_type	= RET_INTEGER,
1386	.arg1_type	= ARG_PTR_TO_TIMER,
1387	.arg2_type	= ARG_PTR_TO_FUNC,
1388};
1389
1390BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
1391{
1392	struct bpf_hrtimer *t;
1393	int ret = 0;
1394	enum hrtimer_mode mode;
1395
1396	if (in_nmi())
1397		return -EOPNOTSUPP;
1398	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
1399		return -EINVAL;
1400	__bpf_spin_lock_irqsave(&timer->lock);
1401	t = timer->timer;
1402	if (!t || !t->cb.prog) {
1403		ret = -EINVAL;
1404		goto out;
1405	}
1406
1407	if (flags & BPF_F_TIMER_ABS)
1408		mode = HRTIMER_MODE_ABS_SOFT;
1409	else
1410		mode = HRTIMER_MODE_REL_SOFT;
1411
1412	if (flags & BPF_F_TIMER_CPU_PIN)
1413		mode |= HRTIMER_MODE_PINNED;
1414
1415	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
1416out:
1417	__bpf_spin_unlock_irqrestore(&timer->lock);
1418	return ret;
1419}
1420
1421static const struct bpf_func_proto bpf_timer_start_proto = {
1422	.func		= bpf_timer_start,
1423	.gpl_only	= true,
1424	.ret_type	= RET_INTEGER,
1425	.arg1_type	= ARG_PTR_TO_TIMER,
1426	.arg2_type	= ARG_ANYTHING,
1427	.arg3_type	= ARG_ANYTHING,
1428};
1429
1430static void drop_prog_refcnt(struct bpf_async_cb *async)
1431{
1432	struct bpf_prog *prog = async->prog;
1433
1434	if (prog) {
1435		bpf_prog_put(prog);
1436		async->prog = NULL;
1437		rcu_assign_pointer(async->callback_fn, NULL);
1438	}
1439}
1440
1441BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
1442{
1443	struct bpf_hrtimer *t;
1444	int ret = 0;
1445
1446	if (in_nmi())
1447		return -EOPNOTSUPP;
1448	rcu_read_lock();
1449	__bpf_spin_lock_irqsave(&timer->lock);
1450	t = timer->timer;
1451	if (!t) {
1452		ret = -EINVAL;
1453		goto out;
1454	}
1455	if (this_cpu_read(hrtimer_running) == t) {
1456		/* If bpf callback_fn is trying to bpf_timer_cancel()
1457		 * its own timer the hrtimer_cancel() will deadlock
1458		 * since it waits for callback_fn to finish
1459		 */
1460		ret = -EDEADLK;
1461		goto out;
1462	}
1463	drop_prog_refcnt(&t->cb);
1464out:
1465	__bpf_spin_unlock_irqrestore(&timer->lock);
1466	/* Cancel the timer and wait for associated callback to finish
1467	 * if it was running.
1468	 */
1469	ret = ret ?: hrtimer_cancel(&t->timer);
1470	rcu_read_unlock();
1471	return ret;
1472}
1473
1474static const struct bpf_func_proto bpf_timer_cancel_proto = {
1475	.func		= bpf_timer_cancel,
1476	.gpl_only	= true,
1477	.ret_type	= RET_INTEGER,
1478	.arg1_type	= ARG_PTR_TO_TIMER,
1479};
1480
1481static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
1482{
1483	struct bpf_async_cb *cb;
1484
1485	/* Performance optimization: read async->cb without lock first. */
1486	if (!READ_ONCE(async->cb))
1487		return NULL;
1488
1489	__bpf_spin_lock_irqsave(&async->lock);
1490	/* re-read it under lock */
1491	cb = async->cb;
1492	if (!cb)
1493		goto out;
1494	drop_prog_refcnt(cb);
1495	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
1496	 * this timer, since it won't be initialized.
1497	 */
1498	WRITE_ONCE(async->cb, NULL);
1499out:
1500	__bpf_spin_unlock_irqrestore(&async->lock);
1501	return cb;
1502}
1503
1504/* This function is called by map_delete/update_elem for individual element and
1505 * by ops->map_release_uref when the user space reference to a map reaches zero.
1506 */
1507void bpf_timer_cancel_and_free(void *val)
1508{
1509	struct bpf_hrtimer *t;
1510
1511	t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
1512
1513	if (!t)
1514		return;
1515	/* Cancel the timer and wait for callback to complete if it was running.
1516	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
1517	 * right after for both preallocated and non-preallocated maps.
1518	 * The async->cb = NULL was already done and no code path can
1519	 * see address 't' anymore.
1520	 *
1521	 * Check that bpf_map_delete/update_elem() wasn't called from timer
1522	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
1523	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
1524	 * return -1). Though callback_fn is still running on this cpu it's
1525	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
1526	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
1527	 * since async->cb = NULL was already done. The timer will be
1528	 * effectively cancelled because bpf_timer_cb() will return
1529	 * HRTIMER_NORESTART.
1530	 */
1531	if (this_cpu_read(hrtimer_running) != t)
1532		hrtimer_cancel(&t->timer);
1533	kfree_rcu(t, cb.rcu);
1534}
1535
1536/* This function is called by map_delete/update_elem for individual element and
1537 * by ops->map_release_uref when the user space reference to a map reaches zero.
1538 */
1539void bpf_wq_cancel_and_free(void *val)
1540{
1541	struct bpf_work *work;
1542
1543	BTF_TYPE_EMIT(struct bpf_wq);
1544
1545	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
1546	if (!work)
1547		return;
1548	/* Trigger cancel of the sleepable work, but *do not* wait for
1549	 * it to finish if it was running, as we might not be in a
1550	 * sleepable context.
1551	 * kfree will be called once the work has finished.
1552	 */
1553	schedule_work(&work->delete_work);
1554}
1555
1556BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
1557{
1558	unsigned long *kptr = map_value;
1559
1560	/* This helper may be inlined by verifier. */
1561	return xchg(kptr, (unsigned long)ptr);
1562}
1563
1564/* Unlike other PTR_TO_BTF_ID helpers, the btf_id in the bpf_kptr_xchg()
1565 * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
1566 * denote the type that the verifier will determine.
1567 */
1568static const struct bpf_func_proto bpf_kptr_xchg_proto = {
1569	.func         = bpf_kptr_xchg,
1570	.gpl_only     = false,
1571	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
1572	.ret_btf_id   = BPF_PTR_POISON,
1573	.arg1_type    = ARG_PTR_TO_KPTR,
1574	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
1575	.arg2_btf_id  = BPF_PTR_POISON,
1576};
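
/* Illustrative BPF-program-side use (a sketch; assumes a map value with a
 * 'struct node_data __kptr *node' field and 'new' obtained from bpf_obj_new()):
 *
 *	old = bpf_kptr_xchg(&val->node, new);
 *	if (old)
 *		bpf_obj_drop(old);
 */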
1577
1578/* Since the upper 8 bits of dynptr->size are reserved, the
1579 * maximum supported size is 2^24 - 1.
1580 */
1581#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
1582#define DYNPTR_TYPE_SHIFT	28
1583#define DYNPTR_SIZE_MASK	0xFFFFFF
1584#define DYNPTR_RDONLY_BIT	BIT(31)
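
/* Resulting layout of bpf_dynptr_kern::size (descriptive sketch):
 * bits 0-23:  size of the referenced data (DYNPTR_SIZE_MASK)
 * bits 28-30: dynptr type (stored via DYNPTR_TYPE_SHIFT)
 * bit 31:     read-only flag (DYNPTR_RDONLY_BIT)
 */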
1585
1586bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
1587{
1588	return ptr->size & DYNPTR_RDONLY_BIT;
1589}
1590
1591void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
1592{
1593	ptr->size |= DYNPTR_RDONLY_BIT;
1594}
1595
1596static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
1597{
1598	ptr->size |= type << DYNPTR_TYPE_SHIFT;
1599}
1600
1601static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
1602{
1603	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
1604}
1605
1606u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
1607{
1608	return ptr->size & DYNPTR_SIZE_MASK;
1609}
1610
1611static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
1612{
1613	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
1614
1615	ptr->size = new_size | metadata;
1616}
1617
1618int bpf_dynptr_check_size(u32 size)
1619{
1620	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
1621}
1622
1623void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
1624		     enum bpf_dynptr_type type, u32 offset, u32 size)
1625{
1626	ptr->data = data;
1627	ptr->offset = offset;
1628	ptr->size = size;
1629	bpf_dynptr_set_type(ptr, type);
1630}
1631
1632void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
1633{
1634	memset(ptr, 0, sizeof(*ptr));
1635}
1636
1637static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
1638{
1639	u32 size = __bpf_dynptr_size(ptr);
1640
1641	if (len > size || offset > size - len)
1642		return -E2BIG;
1643
1644	return 0;
1645}
1646
1647BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
1648{
1649	int err;
1650
1651	BTF_TYPE_EMIT(struct bpf_dynptr);
1652
1653	err = bpf_dynptr_check_size(size);
1654	if (err)
1655		goto error;
1656
1657	/* flags is currently unsupported */
1658	if (flags) {
1659		err = -EINVAL;
1660		goto error;
1661	}
1662
1663	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
1664
1665	return 0;
1666
1667error:
1668	bpf_dynptr_set_null(ptr);
1669	return err;
1670}
1671
1672static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
1673	.func		= bpf_dynptr_from_mem,
1674	.gpl_only	= false,
1675	.ret_type	= RET_INTEGER,
1676	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
1677	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1678	.arg3_type	= ARG_ANYTHING,
1679	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
1680};
1681
1682BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
1683	   u32, offset, u64, flags)
1684{
1685	enum bpf_dynptr_type type;
1686	int err;
1687
1688	if (!src->data || flags)
1689		return -EINVAL;
1690
1691	err = bpf_dynptr_check_off_len(src, offset, len);
1692	if (err)
1693		return err;
1694
1695	type = bpf_dynptr_get_type(src);
1696
1697	switch (type) {
1698	case BPF_DYNPTR_TYPE_LOCAL:
1699	case BPF_DYNPTR_TYPE_RINGBUF:
1700		/* Source and destination may possibly overlap, hence use memmove to
1701		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1702		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1703		 */
1704		memmove(dst, src->data + src->offset + offset, len);
1705		return 0;
1706	case BPF_DYNPTR_TYPE_SKB:
1707		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
1708	case BPF_DYNPTR_TYPE_XDP:
1709		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
1710	default:
1711		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
1712		return -EFAULT;
1713	}
1714}
1715
1716static const struct bpf_func_proto bpf_dynptr_read_proto = {
1717	.func		= bpf_dynptr_read,
1718	.gpl_only	= false,
1719	.ret_type	= RET_INTEGER,
1720	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
1721	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1722	.arg3_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1723	.arg4_type	= ARG_ANYTHING,
1724	.arg5_type	= ARG_ANYTHING,
1725};
1726
1727BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
1728	   u32, len, u64, flags)
1729{
1730	enum bpf_dynptr_type type;
1731	int err;
1732
1733	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
1734		return -EINVAL;
1735
1736	err = bpf_dynptr_check_off_len(dst, offset, len);
1737	if (err)
1738		return err;
1739
1740	type = bpf_dynptr_get_type(dst);
1741
1742	switch (type) {
1743	case BPF_DYNPTR_TYPE_LOCAL:
1744	case BPF_DYNPTR_TYPE_RINGBUF:
1745		if (flags)
1746			return -EINVAL;
1747		/* Source and destination may possibly overlap, hence use memmove to
1748		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1749		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1750		 */
1751		memmove(dst->data + dst->offset + offset, src, len);
1752		return 0;
1753	case BPF_DYNPTR_TYPE_SKB:
1754		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
1755					     flags);
1756	case BPF_DYNPTR_TYPE_XDP:
1757		if (flags)
1758			return -EINVAL;
1759		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
1760	default:
1761		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
1762		return -EFAULT;
1763	}
1764}
1765
1766static const struct bpf_func_proto bpf_dynptr_write_proto = {
1767	.func		= bpf_dynptr_write,
1768	.gpl_only	= false,
1769	.ret_type	= RET_INTEGER,
1770	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1771	.arg2_type	= ARG_ANYTHING,
1772	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1773	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
1774	.arg5_type	= ARG_ANYTHING,
1775};
1776
1777BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
1778{
1779	enum bpf_dynptr_type type;
1780	int err;
1781
1782	if (!ptr->data)
1783		return 0;
1784
1785	err = bpf_dynptr_check_off_len(ptr, offset, len);
1786	if (err)
1787		return 0;
1788
1789	if (__bpf_dynptr_is_rdonly(ptr))
1790		return 0;
1791
1792	type = bpf_dynptr_get_type(ptr);
1793
1794	switch (type) {
1795	case BPF_DYNPTR_TYPE_LOCAL:
1796	case BPF_DYNPTR_TYPE_RINGBUF:
1797		return (unsigned long)(ptr->data + ptr->offset + offset);
1798	case BPF_DYNPTR_TYPE_SKB:
1799	case BPF_DYNPTR_TYPE_XDP:
1800		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
1801		return 0;
1802	default:
1803		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
1804		return 0;
1805	}
1806}
1807
1808static const struct bpf_func_proto bpf_dynptr_data_proto = {
1809	.func		= bpf_dynptr_data,
1810	.gpl_only	= false,
1811	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
1812	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1813	.arg2_type	= ARG_ANYTHING,
1814	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
1815};
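
/* Illustrative BPF-program-side use of the dynptr helpers (a sketch; 'raw' is
 * assumed to live in global data, i.e. be map-value backed):
 *
 *	char raw[16];			// a global (.bss) buffer
 *	struct bpf_dynptr ptr;
 *	__u32 word = 0xabcd;
 *
 *	bpf_dynptr_from_mem(raw, sizeof(raw), 0, &ptr);
 *	bpf_dynptr_write(&ptr, 0, &word, sizeof(word), 0);
 *	bpf_dynptr_read(&word, sizeof(word), &ptr, 0, 0);
 */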
1816
1817const struct bpf_func_proto bpf_get_current_task_proto __weak;
1818const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
1819const struct bpf_func_proto bpf_probe_read_user_proto __weak;
1820const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
1821const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
1822const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
1823const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
1824
1825const struct bpf_func_proto *
1826bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1827{
1828	switch (func_id) {
1829	case BPF_FUNC_map_lookup_elem:
1830		return &bpf_map_lookup_elem_proto;
1831	case BPF_FUNC_map_update_elem:
1832		return &bpf_map_update_elem_proto;
1833	case BPF_FUNC_map_delete_elem:
1834		return &bpf_map_delete_elem_proto;
1835	case BPF_FUNC_map_push_elem:
1836		return &bpf_map_push_elem_proto;
1837	case BPF_FUNC_map_pop_elem:
1838		return &bpf_map_pop_elem_proto;
1839	case BPF_FUNC_map_peek_elem:
1840		return &bpf_map_peek_elem_proto;
1841	case BPF_FUNC_map_lookup_percpu_elem:
1842		return &bpf_map_lookup_percpu_elem_proto;
1843	case BPF_FUNC_get_prandom_u32:
1844		return &bpf_get_prandom_u32_proto;
1845	case BPF_FUNC_get_smp_processor_id:
1846		return &bpf_get_raw_smp_processor_id_proto;
1847	case BPF_FUNC_get_numa_node_id:
1848		return &bpf_get_numa_node_id_proto;
1849	case BPF_FUNC_tail_call:
1850		return &bpf_tail_call_proto;
1851	case BPF_FUNC_ktime_get_ns:
1852		return &bpf_ktime_get_ns_proto;
1853	case BPF_FUNC_ktime_get_boot_ns:
1854		return &bpf_ktime_get_boot_ns_proto;
1855	case BPF_FUNC_ktime_get_tai_ns:
1856		return &bpf_ktime_get_tai_ns_proto;
1857	case BPF_FUNC_ringbuf_output:
1858		return &bpf_ringbuf_output_proto;
1859	case BPF_FUNC_ringbuf_reserve:
1860		return &bpf_ringbuf_reserve_proto;
1861	case BPF_FUNC_ringbuf_submit:
1862		return &bpf_ringbuf_submit_proto;
1863	case BPF_FUNC_ringbuf_discard:
1864		return &bpf_ringbuf_discard_proto;
1865	case BPF_FUNC_ringbuf_query:
1866		return &bpf_ringbuf_query_proto;
1867	case BPF_FUNC_strncmp:
1868		return &bpf_strncmp_proto;
1869	case BPF_FUNC_strtol:
1870		return &bpf_strtol_proto;
1871	case BPF_FUNC_strtoul:
1872		return &bpf_strtoul_proto;
1873	case BPF_FUNC_get_current_pid_tgid:
1874		return &bpf_get_current_pid_tgid_proto;
1875	case BPF_FUNC_get_ns_current_pid_tgid:
1876		return &bpf_get_ns_current_pid_tgid_proto;
1877	default:
1878		break;
1879	}
1880
1881	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
1882		return NULL;
1883
1884	switch (func_id) {
1885	case BPF_FUNC_spin_lock:
1886		return &bpf_spin_lock_proto;
1887	case BPF_FUNC_spin_unlock:
1888		return &bpf_spin_unlock_proto;
1889	case BPF_FUNC_jiffies64:
1890		return &bpf_jiffies64_proto;
1891	case BPF_FUNC_per_cpu_ptr:
1892		return &bpf_per_cpu_ptr_proto;
1893	case BPF_FUNC_this_cpu_ptr:
1894		return &bpf_this_cpu_ptr_proto;
1895	case BPF_FUNC_timer_init:
1896		return &bpf_timer_init_proto;
1897	case BPF_FUNC_timer_set_callback:
1898		return &bpf_timer_set_callback_proto;
1899	case BPF_FUNC_timer_start:
1900		return &bpf_timer_start_proto;
1901	case BPF_FUNC_timer_cancel:
1902		return &bpf_timer_cancel_proto;
1903	case BPF_FUNC_kptr_xchg:
1904		return &bpf_kptr_xchg_proto;
1905	case BPF_FUNC_for_each_map_elem:
1906		return &bpf_for_each_map_elem_proto;
1907	case BPF_FUNC_loop:
1908		return &bpf_loop_proto;
1909	case BPF_FUNC_user_ringbuf_drain:
1910		return &bpf_user_ringbuf_drain_proto;
1911	case BPF_FUNC_ringbuf_reserve_dynptr:
1912		return &bpf_ringbuf_reserve_dynptr_proto;
1913	case BPF_FUNC_ringbuf_submit_dynptr:
1914		return &bpf_ringbuf_submit_dynptr_proto;
1915	case BPF_FUNC_ringbuf_discard_dynptr:
1916		return &bpf_ringbuf_discard_dynptr_proto;
1917	case BPF_FUNC_dynptr_from_mem:
1918		return &bpf_dynptr_from_mem_proto;
1919	case BPF_FUNC_dynptr_read:
1920		return &bpf_dynptr_read_proto;
1921	case BPF_FUNC_dynptr_write:
1922		return &bpf_dynptr_write_proto;
1923	case BPF_FUNC_dynptr_data:
1924		return &bpf_dynptr_data_proto;
1925#ifdef CONFIG_CGROUPS
1926	case BPF_FUNC_cgrp_storage_get:
1927		return &bpf_cgrp_storage_get_proto;
1928	case BPF_FUNC_cgrp_storage_delete:
1929		return &bpf_cgrp_storage_delete_proto;
1930	case BPF_FUNC_get_current_cgroup_id:
1931		return &bpf_get_current_cgroup_id_proto;
1932	case BPF_FUNC_get_current_ancestor_cgroup_id:
1933		return &bpf_get_current_ancestor_cgroup_id_proto;
1934#endif
1935	default:
1936		break;
1937	}
1938
1939	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
1940		return NULL;
1941
1942	switch (func_id) {
1943	case BPF_FUNC_trace_printk:
1944		return bpf_get_trace_printk_proto();
1945	case BPF_FUNC_get_current_task:
1946		return &bpf_get_current_task_proto;
1947	case BPF_FUNC_get_current_task_btf:
1948		return &bpf_get_current_task_btf_proto;
1949	case BPF_FUNC_probe_read_user:
1950		return &bpf_probe_read_user_proto;
1951	case BPF_FUNC_probe_read_kernel:
1952		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
1953		       NULL : &bpf_probe_read_kernel_proto;
1954	case BPF_FUNC_probe_read_user_str:
1955		return &bpf_probe_read_user_str_proto;
1956	case BPF_FUNC_probe_read_kernel_str:
1957		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
1958		       NULL : &bpf_probe_read_kernel_str_proto;
1959	case BPF_FUNC_snprintf_btf:
1960		return &bpf_snprintf_btf_proto;
1961	case BPF_FUNC_snprintf:
1962		return &bpf_snprintf_proto;
1963	case BPF_FUNC_task_pt_regs:
1964		return &bpf_task_pt_regs_proto;
1965	case BPF_FUNC_trace_vprintk:
1966		return bpf_get_trace_vprintk_proto();
1967	default:
1968		return NULL;
1969	}
1970}
1971
1972void bpf_list_head_free(const struct btf_field *field, void *list_head,
1973			struct bpf_spin_lock *spin_lock)
1974{
1975	struct list_head *head = list_head, *orig_head = list_head;
1976
1977	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
1978	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
1979
1980	/* Do the actual list draining outside the lock to not hold the lock for
1981	 * too long, and also prevent deadlocks if tracing programs end up
1982	 * executing on entry/exit of functions called inside the critical
1983	 * section, and end up doing map ops that call bpf_list_head_free for
1984	 * the same map value again.
1985	 */
1986	__bpf_spin_lock_irqsave(spin_lock);
1987	if (!head->next || list_empty(head))
1988		goto unlock;
1989	head = head->next;
1990unlock:
1991	INIT_LIST_HEAD(orig_head);
1992	__bpf_spin_unlock_irqrestore(spin_lock);
1993
1994	while (head != orig_head) {
1995		void *obj = head;
1996
1997		obj -= field->graph_root.node_offset;
1998		head = head->next;
1999		/* The contained type can also have resources, including a
2000		 * bpf_list_head which needs to be freed.
2001		 */
2002		migrate_disable();
2003		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
2004		migrate_enable();
2005	}
2006}
2007
2008/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
2009 * 'rb_node *', so field name of rb_node within containing struct is not
2010 * needed.
2011 *
2012 * Since bpf_rb_tree's node type has a corresponding struct btf_field with
2013 * graph_root.node_offset, it's not necessary to know field name
2014 * or type of node struct
2015 */
2016#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
2017	for (pos = rb_first_postorder(root); \
2018	    pos && ({ n = rb_next_postorder(pos); 1; }); \
2019	    pos = n)
2020
2021void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
2022		      struct bpf_spin_lock *spin_lock)
2023{
2024	struct rb_root_cached orig_root, *root = rb_root;
2025	struct rb_node *pos, *n;
2026	void *obj;
2027
2028	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
2029	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
2030
2031	__bpf_spin_lock_irqsave(spin_lock);
2032	orig_root = *root;
2033	*root = RB_ROOT_CACHED;
2034	__bpf_spin_unlock_irqrestore(spin_lock);
2035
2036	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2037		obj = pos;
2038		obj -= field->graph_root.node_offset;
2039
2041		migrate_disable();
2042		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
2043		migrate_enable();
2044	}
2045}
2046
2047__bpf_kfunc_start_defs();
2048
2049__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2050{
2051	struct btf_struct_meta *meta = meta__ign;
2052	u64 size = local_type_id__k;
2053	void *p;
2054
2055	p = bpf_mem_alloc(&bpf_global_ma, size);
2056	if (!p)
2057		return NULL;
2058	if (meta)
2059		bpf_obj_init(meta->record, p);
2060	return p;
2061}
2062
2063__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2064{
2065	u64 size = local_type_id__k;
2066
2067	/* The verifier has ensured that meta__ign must be NULL */
2068	return bpf_mem_alloc(&bpf_global_percpu_ma, size);
2069}
2070
2071/* Must be called under migrate_disable(), as required by bpf_mem_free */
2072void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
2073{
2074	struct bpf_mem_alloc *ma;
2075
2076	if (rec && rec->refcount_off >= 0 &&
2077	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
2078		/* Object is refcounted and refcount_dec didn't result in 0
2079		 * refcount. Return without freeing the object
2080		 */
2081		return;
2082	}
2083
2084	if (rec)
2085		bpf_obj_free_fields(rec, p);
2086
2087	if (percpu)
2088		ma = &bpf_global_percpu_ma;
2089	else
2090		ma = &bpf_global_ma;
2091	bpf_mem_free_rcu(ma, p);
2092}
2093
2094__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
2095{
2096	struct btf_struct_meta *meta = meta__ign;
2097	void *p = p__alloc;
2098
2099	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
2100}
2101
2102__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
2103{
2104	/* The verifier has ensured that meta__ign must be NULL */
2105	bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
2106}
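
/* Sketch of the allocate/free pairing as seen from a BPF program. The
 * bpf_obj_new()/bpf_obj_drop() wrappers (from the selftests'
 * bpf_experimental.h) pass the BTF type id and struct meta down to the
 * _impl kfuncs above; "struct foo" is a made-up program-local type.
 *
 *	struct foo { int val; };
 *
 *	struct foo *f = bpf_obj_new(typeof(*f));
 *	if (!f)
 *		return 0;
 *	f->val = 1;
 *	bpf_obj_drop(f);	// owned reference must be dropped or stored
 */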
2107
2108__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
2109{
2110	struct btf_struct_meta *meta = meta__ign;
2111	struct bpf_refcount *ref;
2112
2113	/* Could just cast directly to refcount_t *, but need some code using
2114	 * bpf_refcount type so that it is emitted in vmlinux BTF
2115	 */
2116	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
2117	if (!refcount_inc_not_zero((refcount_t *)ref))
2118		return NULL;
2119
2120	/* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
2121	 * in verifier.c
2122	 */
2123	return (void *)p__refcounted_kptr;
2124}
2125
2126static int __bpf_list_add(struct bpf_list_node_kern *node,
2127			  struct bpf_list_head *head,
2128			  bool tail, struct btf_record *rec, u64 off)
2129{
2130	struct list_head *n = &node->list_head, *h = (void *)head;
2131
2132	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2133	 * called on its fields, so init here
2134	 */
2135	if (unlikely(!h->next))
2136		INIT_LIST_HEAD(h);
2137
2138	/* node->owner != NULL implies !list_empty(n), no need to separately
2139	 * check the latter
2140	 */
2141	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2142		/* Only called from BPF prog, no need to migrate_disable */
2143		__bpf_obj_drop_impl((void *)n - off, rec, false);
2144		return -EINVAL;
2145	}
2146
2147	tail ? list_add_tail(n, h) : list_add(n, h);
2148	WRITE_ONCE(node->owner, head);
2149
2150	return 0;
2151}
2152
2153__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
2154					 struct bpf_list_node *node,
2155					 void *meta__ign, u64 off)
2156{
2157	struct bpf_list_node_kern *n = (void *)node;
2158	struct btf_struct_meta *meta = meta__ign;
2159
2160	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
2161}
2162
2163__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
2164					struct bpf_list_node *node,
2165					void *meta__ign, u64 off)
2166{
2167	struct bpf_list_node_kern *n = (void *)node;
2168	struct btf_struct_meta *meta = meta__ign;
2169
2170	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
2171}
2172
2173static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
2174{
2175	struct list_head *n, *h = (void *)head;
2176	struct bpf_list_node_kern *node;
2177
2178	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2179	 * called on its fields, so init here
2180	 */
2181	if (unlikely(!h->next))
2182		INIT_LIST_HEAD(h);
2183	if (list_empty(h))
2184		return NULL;
2185
2186	n = tail ? h->prev : h->next;
2187	node = container_of(n, struct bpf_list_node_kern, list_head);
2188	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
2189		return NULL;
2190
2191	list_del_init(n);
2192	WRITE_ONCE(node->owner, NULL);
2193	return (struct bpf_list_node *)n;
2194}
2195
2196__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
2197{
2198	return __bpf_list_del(head, false);
2199}
2200
2201__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
2202{
2203	return __bpf_list_del(head, true);
2204}
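
/* Sketch of list usage from a BPF program, using the wrappers and macros
 * seen in the BPF selftests (bpf_obj_new, bpf_list_push_front,
 * bpf_list_pop_front, bpf_obj_drop, private(), __contains()). The node type,
 * lock and list head below are illustrative only.
 *
 *	struct node_data {
 *		int data;
 *		struct bpf_list_node node;
 *	};
 *
 *	private(A) struct bpf_spin_lock glock;
 *	private(A) struct bpf_list_head ghead __contains(node_data, node);
 *
 *	struct node_data *n = bpf_obj_new(typeof(*n));
 *	if (!n)
 *		return 0;
 *	bpf_spin_lock(&glock);
 *	bpf_list_push_front(&ghead, &n->node);	// list takes ownership of n
 *	struct bpf_list_node *popped = bpf_list_pop_front(&ghead);
 *	bpf_spin_unlock(&glock);
 *	if (popped)	// popped node is an owned reference again
 *		bpf_obj_drop(container_of(popped, struct node_data, node));
 */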
2205
2206__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
2207						  struct bpf_rb_node *node)
2208{
2209	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2210	struct rb_root_cached *r = (struct rb_root_cached *)root;
2211	struct rb_node *n = &node_internal->rb_node;
2212
2213	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
2214	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
2215	 */
2216	if (READ_ONCE(node_internal->owner) != root)
2217		return NULL;
2218
2219	rb_erase_cached(n, r);
2220	RB_CLEAR_NODE(n);
2221	WRITE_ONCE(node_internal->owner, NULL);
2222	return (struct bpf_rb_node *)n;
2223}
2224
2225/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
2226 * program
2227 */
2228static int __bpf_rbtree_add(struct bpf_rb_root *root,
2229			    struct bpf_rb_node_kern *node,
2230			    void *less, struct btf_record *rec, u64 off)
2231{
2232	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
2233	struct rb_node *parent = NULL, *n = &node->rb_node;
2234	bpf_callback_t cb = (bpf_callback_t)less;
2235	bool leftmost = true;
2236
2237	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
2238	 * check the latter
2239	 */
2240	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2241		/* Only called from BPF prog, no need to migrate_disable */
2242		__bpf_obj_drop_impl((void *)n - off, rec, false);
2243		return -EINVAL;
2244	}
2245
2246	while (*link) {
2247		parent = *link;
2248		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
2249			link = &parent->rb_left;
2250		} else {
2251			link = &parent->rb_right;
2252			leftmost = false;
2253		}
2254	}
2255
2256	rb_link_node(n, parent, link);
2257	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
2258	WRITE_ONCE(node->owner, root);
2259	return 0;
2260}
2261
2262__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
2263				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2264				    void *meta__ign, u64 off)
2265{
2266	struct btf_struct_meta *meta = meta__ign;
2267	struct bpf_rb_node_kern *n = (void *)node;
2268
2269	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
2270}
2271
2272__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
2273{
2274	struct rb_root_cached *r = (struct rb_root_cached *)root;
2275
2276	return (struct bpf_rb_node *)rb_first_cached(r);
2277}
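
/* Sketch of rbtree usage from a BPF program (wrappers from the selftests'
 * bpf_experimental.h; node type, lock, root and less() are illustrative).
 * The less() callback is the BPF-side comparison function handed to
 * __bpf_rbtree_add() above.
 *
 *	struct node_data {
 *		long key;
 *		struct bpf_rb_node node;
 *	};
 *
 *	private(A) struct bpf_spin_lock glock;
 *	private(A) struct bpf_rb_root groot __contains(node_data, node);
 *
 *	static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
 *	{
 *		struct node_data *na = container_of(a, struct node_data, node);
 *		struct node_data *nb = container_of(b, struct node_data, node);
 *
 *		return na->key < nb->key;
 *	}
 *
 *	bpf_spin_lock(&glock);
 *	bpf_rbtree_add(&groot, &n->node, less);
 *	bpf_spin_unlock(&glock);
 */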
2278
2279/**
2280 * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
2281 * kfunc that is not stored in a map as a kptr must be released by calling
2282 * bpf_task_release().
2283 * @p: The task on which a reference is being acquired.
2284 */
2285__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
2286{
2287	if (refcount_inc_not_zero(&p->rcu_users))
2288		return p;
2289	return NULL;
2290}
2291
2292/**
2293 * bpf_task_release - Release the reference acquired on a task.
2294 * @p: The task on which a reference is being released.
2295 */
2296__bpf_kfunc void bpf_task_release(struct task_struct *p)
2297{
2298	put_task_struct_rcu_user(p);
2299}
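
/* Typical acquire/release pairing from a BPF program; a sketch assuming a
 * tp_btf program that receives a trusted task pointer (e.g. the
 * task_newtask tracepoint used by the selftests):
 *
 *	struct task_struct *acquired;
 *
 *	acquired = bpf_task_acquire(task);
 *	if (!acquired)
 *		return 0;
 *	// ... use acquired, or store it in a map as a kptr ...
 *	bpf_task_release(acquired);
 */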
2300
2301__bpf_kfunc void bpf_task_release_dtor(void *p)
2302{
2303	put_task_struct_rcu_user(p);
2304}
2305CFI_NOSEAL(bpf_task_release_dtor);
2306
2307#ifdef CONFIG_CGROUPS
2308/**
2309 * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
2310 * this kfunc that is not stored in a map as a kptr must be released by
2311 * calling bpf_cgroup_release().
2312 * @cgrp: The cgroup on which a reference is being acquired.
2313 */
2314__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
2315{
2316	return cgroup_tryget(cgrp) ? cgrp : NULL;
2317}
2318
2319/**
2320 * bpf_cgroup_release - Release the reference acquired on a cgroup.
2321 * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
2322 * not be freed until the current grace period has ended, even if its refcount
2323 * drops to 0.
2324 * @cgrp: The cgroup on which a reference is being released.
2325 */
2326__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
2327{
2328	cgroup_put(cgrp);
2329}
2330
2331__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
2332{
2333	cgroup_put(cgrp);
2334}
2335CFI_NOSEAL(bpf_cgroup_release_dtor);
2336
2337/**
2338 * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
2339 * array. A cgroup returned by this kfunc that is not subsequently stored in a
2340 * map must be released by calling bpf_cgroup_release().
2341 * @cgrp: The cgroup for which we're performing a lookup.
2342 * @level: The level of ancestor to look up.
2343 */
2344__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
2345{
2346	struct cgroup *ancestor;
2347
2348	if (level > cgrp->level || level < 0)
2349		return NULL;
2350
2351	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
2352	ancestor = cgrp->ancestors[level];
2353	if (!cgroup_tryget(ancestor))
2354		return NULL;
2355	return ancestor;
2356}
2357
2358/**
2359 * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
2360 * kfunc that is not subsequently stored in a map must be released by calling
2361 * bpf_cgroup_release().
2362 * @cgid: cgroup id.
2363 */
2364__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
2365{
2366	struct cgroup *cgrp;
2367
2368	cgrp = cgroup_get_from_id(cgid);
2369	if (IS_ERR(cgrp))
2370		return NULL;
2371	return cgrp;
2372}
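
/* Sketch of looking up a cgroup by id and releasing it; cgid would typically
 * come from a map or from bpf_get_current_cgroup_id():
 *
 *	struct cgroup *cgrp = bpf_cgroup_from_id(cgid);
 *
 *	if (!cgrp)
 *		return 0;
 *	// ... read cgrp fields, e.g. cgrp->level ...
 *	bpf_cgroup_release(cgrp);
 */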
2373
2374/**
2375 * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc to test
2376 * a task's membership of a cgroup's ancestry.
2377 * @task: the task to be tested
2378 * @ancestor: possible ancestor of @task's cgroup
2379 *
2380 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
2381 * It follows all the same rules as cgroup_is_descendant, and only applies
2382 * to the default hierarchy.
2383 */
2384__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
2385				       struct cgroup *ancestor)
2386{
2387	long ret;
2388
2389	rcu_read_lock();
2390	ret = task_under_cgroup_hierarchy(task, ancestor);
2391	rcu_read_unlock();
2392	return ret;
2393}
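
/* Sketch: gate program logic on cgroup membership, assuming 'task' is a
 * trusted or RCU-protected task pointer and 'ancestor' was acquired, e.g.
 * via bpf_cgroup_from_id():
 *
 *	if (!bpf_task_under_cgroup(task, ancestor))
 *		return 0;	// task is outside ancestor's subtree
 */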
2394
2395/**
2396 * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
2397 * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
2398 * hierarchy ID.
2399 * @task: The target task
2400 * @hierarchy_id: The ID of a cgroup1 hierarchy
2401 *
2402 * On success, the cgroup is returned. On failure, NULL is returned.
2403 */
2404__bpf_kfunc struct cgroup *
2405bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
2406{
2407	struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
2408
2409	if (IS_ERR(cgrp))
2410		return NULL;
2411	return cgrp;
2412}
2413#endif /* CONFIG_CGROUPS */
2414
2415/**
2416 * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
2417 * in the root pid namespace idr. If a task is returned, it must either be
2418 * stored in a map, or released with bpf_task_release().
2419 * @pid: The pid of the task being looked up.
2420 */
2421__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
2422{
2423	struct task_struct *p;
2424
2425	rcu_read_lock();
2426	p = find_task_by_pid_ns(pid, &init_pid_ns);
2427	if (p)
2428		p = bpf_task_acquire(p);
2429	rcu_read_unlock();
2430
2431	return p;
2432}
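
/* Sketch of a pid lookup; note the pid is resolved in the root (init) pid
 * namespace:
 *
 *	struct task_struct *p = bpf_task_from_pid(target_pid);
 *
 *	if (!p)
 *		return 0;
 *	// ... inspect p ...
 *	bpf_task_release(p);
 */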
2433
2434/**
2435 * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
2436 * @ptr: The dynptr whose data slice to retrieve
2437 * @offset: Offset into the dynptr
2438 * @buffer__opt: User-provided buffer to copy contents into.  May be NULL
2439 * @buffer__szk: Size (in bytes) of the buffer if present. This is the
2440 *               length of the requested slice. This must be a constant.
2441 *
2442 * For non-skb and non-xdp type dynptrs, there is no difference between
2443 * bpf_dynptr_slice and bpf_dynptr_data.
2444 *
2445 * If buffer__opt is NULL, the call will fail if the buffer would have been needed.
2446 *
2447 * If the intention is to write to the data slice, please use
2448 * bpf_dynptr_slice_rdwr.
2449 *
2450 * The user must check that the returned pointer is not null before using it.
2451 *
2452 * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
2453 * does not change the underlying packet data pointers, so a call to
2454 * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
2455 * the bpf program.
2456 *
2457 * Return: NULL if the call failed (e.g. invalid dynptr), pointer to a read-only
2458 * data slice (can be either direct pointer to the data or a pointer to the user
2459 * provided buffer, with its contents containing the data, if unable to obtain
2460 * direct pointer)
2461 */
2462__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
2463				   void *buffer__opt, u32 buffer__szk)
2464{
2465	enum bpf_dynptr_type type;
2466	u32 len = buffer__szk;
2467	int err;
2468
2469	if (!ptr->data)
2470		return NULL;
2471
2472	err = bpf_dynptr_check_off_len(ptr, offset, len);
2473	if (err)
2474		return NULL;
2475
2476	type = bpf_dynptr_get_type(ptr);
2477
2478	switch (type) {
2479	case BPF_DYNPTR_TYPE_LOCAL:
2480	case BPF_DYNPTR_TYPE_RINGBUF:
2481		return ptr->data + ptr->offset + offset;
2482	case BPF_DYNPTR_TYPE_SKB:
2483		if (buffer__opt)
2484			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
2485		else
2486			return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
2487	case BPF_DYNPTR_TYPE_XDP:
2488	{
2489		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);

2490		if (!IS_ERR_OR_NULL(xdp_ptr))
2491			return xdp_ptr;
2492
2493		if (!buffer__opt)
2494			return NULL;
2495		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
2496		return buffer__opt;
2497	}
2498	default:
2499		WARN_ONCE(true, "unknown dynptr type %d\n", type);
2500		return NULL;
2501	}
2502}
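
/* Sketch of a read-only slice over an skb dynptr. 'ptr' is assumed to have
 * been initialized with bpf_dynptr_from_skb(); eth_buf is the bounce buffer
 * used when the requested bytes are not in the linear area:
 *
 *	struct ethhdr eth_buf, *eth;
 *
 *	eth = bpf_dynptr_slice(&ptr, 0, &eth_buf, sizeof(eth_buf));
 *	if (!eth)
 *		return TC_ACT_SHOT;
 *	// read-only access, e.g. eth->h_proto; use bpf_dynptr_slice_rdwr()
 *	// (below) if the slice needs to be written
 */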
2503
2504/**
2505 * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
2506 * @ptr: The dynptr whose data slice to retrieve
2507 * @offset: Offset into the dynptr
2508 * @buffer__opt: User-provided buffer to copy contents into. May be NULL
2509 * @buffer__szk: Size (in bytes) of the buffer if present. This is the
2510 *               length of the requested slice. This must be a constant.
2511 *
2512 * For non-skb and non-xdp type dynptrs, there is no difference between
2513 * bpf_dynptr_slice and bpf_dynptr_data.
2514 *
2515 * If buffer__opt is NULL, the call will fail if the buffer would have been needed.
2516 *
2517 * The returned pointer is writable and may point either directly to the dynptr
2518 * data at the requested offset or to the buffer if a direct data pointer could
2519 * not be obtained (for example, when the requested slice is in the paged area of
2520 * an skb packet). In the case where the returned pointer is to the buffer, the
2521 * user is responsible for persisting writes by calling bpf_dynptr_write(). This
2522 * usually looks something like this pattern:
2523 *
2524 * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
2525 * if (!eth)
2526 *	return TC_ACT_SHOT;
2527 *
2528 * // mutate eth header //
2529 *
2530 * if (eth == buffer)
2531 *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
2532 *
2533 * Please note that, as in the example above, the user must check that the
2534 * returned pointer is not null before using it.
2535 *
2536 * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
2537 * does not change the underlying packet data pointers, so a call to
2538 * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
2539 * the bpf program.
2540 *
2541 * Return: NULL if the call failed (e.g. invalid dynptr), pointer to a
2542 * data slice (can be either direct pointer to the data or a pointer to the user
2543 * provided buffer, with its contents containing the data, if unable to obtain
2544 * direct pointer)
2545 */
2546__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
2547					void *buffer__opt, u32 buffer__szk)
2548{
2549	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
2550		return NULL;
2551
2552	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
2553	 *
2554	 * For skb-type dynptrs, it is safe to write into the returned pointer
2555	 * if the bpf program allows skb data writes. There are two possibilities
2556	 * that may occur when calling bpf_dynptr_slice_rdwr:
2557	 *
2558	 * 1) The requested slice is in the head of the skb. In this case, the
2559	 * returned pointer is directly to skb data, and if the skb is cloned, the
2560	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
2561	 * The pointer can be directly written into.
2562	 *
2563	 * 2) Some portion of the requested slice is in the paged buffer area.
2564	 * In this case, the requested data will be copied out into the buffer
2565	 * and the returned pointer will be a pointer to the buffer. The skb
2566	 * will not be pulled. To persist the write, the user will need to call
2567	 * bpf_dynptr_write(), which will pull the skb and commit the write.
2568	 *
2569	 * Similarly for xdp programs, if the requested slice is not across xdp
2570	 * fragments, then a direct pointer will be returned, otherwise the data
2571	 * will be copied out into the buffer and the user will need to call
2572	 * bpf_dynptr_write() to commit changes.
2573	 */
2574	return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
2575}
2576
2577__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
2578{
2579	u32 size;
2580
2581	if (!ptr->data || start > end)
2582		return -EINVAL;
2583
2584	size = __bpf_dynptr_size(ptr);
2585
2586	if (start > size || end > size)
2587		return -ERANGE;
2588
2589	ptr->offset += start;
2590	bpf_dynptr_set_size(ptr, end - start);
2591
2592	return 0;
2593}
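
/* Sketch: narrow a dynptr's view to a sub-range. start/end are offsets into
 * the current view, so adjustments compose; hdr_len is illustrative:
 *
 *	// drop the first hdr_len bytes, keep the rest
 *	if (bpf_dynptr_adjust(&ptr, hdr_len, bpf_dynptr_size(&ptr)))
 *		return 0;
 */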
2594
2595__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
2596{
2597	return !ptr->data;
2598}
2599
2600__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
2601{
2602	if (!ptr->data)
2603		return false;
2604
2605	return __bpf_dynptr_is_rdonly(ptr);
2606}
2607
2608__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
2609{
2610	if (!ptr->data)
2611		return -EINVAL;
2612
2613	return __bpf_dynptr_size(ptr);
2614}
2615
2616__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
2617				 struct bpf_dynptr_kern *clone__uninit)
2618{
2619	if (!ptr->data) {
2620		bpf_dynptr_set_null(clone__uninit);
2621		return -EINVAL;
2622	}
2623
2624	*clone__uninit = *ptr;
2625
2626	return 0;
2627}
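
/* Sketch: clone before adjusting so the original view is preserved. The
 * clone references the same underlying data and inherits the read-only
 * property:
 *
 *	struct bpf_dynptr payload;
 *
 *	if (bpf_dynptr_clone(&ptr, &payload))
 *		return 0;
 *	bpf_dynptr_adjust(&payload, hdr_len, bpf_dynptr_size(&payload));
 */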
2628
2629__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
2630{
2631	return obj;
2632}
2633
2634__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
2635{
2636	return (void *)obj__ign;
2637}
2638
2639__bpf_kfunc void bpf_rcu_read_lock(void)
2640{
2641	rcu_read_lock();
2642}
2643
2644__bpf_kfunc void bpf_rcu_read_unlock(void)
2645{
2646	rcu_read_unlock();
2647}
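
/* Sketch of an explicit RCU read-side section in a BPF program. Pointers
 * loaded from __rcu-tagged fields (e.g. task->real_parent) are only trusted
 * while the section is held, and the verifier enforces that lock and unlock
 * are balanced:
 *
 *	bpf_rcu_read_lock();
 *	parent = task->real_parent;
 *	// ... use parent ...
 *	bpf_rcu_read_unlock();
 */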
2648
2649struct bpf_throw_ctx {
2650	struct bpf_prog_aux *aux;
2651	u64 sp;
2652	u64 bp;
2653	int cnt;
2654};
2655
2656static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
2657{
2658	struct bpf_throw_ctx *ctx = cookie;
2659	struct bpf_prog *prog;
2660
2661	if (!is_bpf_text_address(ip))
2662		return !ctx->cnt;
2663	prog = bpf_prog_ksym_find(ip);
2664	ctx->cnt++;
2665	if (bpf_is_subprog(prog))
2666		return true;
2667	ctx->aux = prog->aux;
2668	ctx->sp = sp;
2669	ctx->bp = bp;
2670	return false;
2671}
2672
2673__bpf_kfunc void bpf_throw(u64 cookie)
2674{
2675	struct bpf_throw_ctx ctx = {};
2676
2677	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
2678	WARN_ON_ONCE(!ctx.aux);
2679	if (ctx.aux)
2680		WARN_ON_ONCE(!ctx.aux->exception_boundary);
2681	WARN_ON_ONCE(!ctx.bp);
2682	WARN_ON_ONCE(!ctx.cnt);
2683	/* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
2684	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
2685	 * which skips compiler generated instrumentation to do the same.
2686	 */
2687	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
2688	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
2689	WARN(1, "A call to BPF exception callback should never return\n");
2690}
2691
2692__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
2693{
2694	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
2695	struct bpf_map *map = p__map;
2696
2697	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
2698	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
2699
2700	if (flags)
2701		return -EINVAL;
2702
2703	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
2704}
2705
2706__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
2707{
2708	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
2709	struct bpf_work *w;
2710
2711	if (in_nmi())
2712		return -EOPNOTSUPP;
2713	if (flags)
2714		return -EINVAL;
2715	w = READ_ONCE(async->work);
2716	if (!w || !READ_ONCE(w->cb.prog))
2717		return -EINVAL;
2718
2719	schedule_work(&w->work);
2720	return 0;
2721}
2722
2723__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
2724					 int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
2725					 unsigned int flags,
2726					 void *aux__ign)
2727{
2728	struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
2729	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
2730
2731	if (flags)
2732		return -EINVAL;
2733
2734	return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
2735}
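
/* Sketch of wq usage from a BPF program: the bpf_wq lives in a map value,
 * and bpf_wq_set_callback() is assumed to be the wrapper (selftests'
 * bpf_experimental.h) around bpf_wq_set_callback_impl(). Map, value type
 * and callback names are illustrative.
 *
 *	struct elem { struct bpf_wq wq; };
 *
 *	static int wq_cb(void *map, int *key, struct bpf_wq *wq)
 *	{
 *		return 0;
 *	}
 *
 *	struct elem *val = bpf_map_lookup_elem(&arrmap, &key);
 *	if (!val)
 *		return 0;
 *	if (bpf_wq_init(&val->wq, &arrmap, 0))
 *		return 0;
 *	if (bpf_wq_set_callback(&val->wq, wq_cb, 0))
 *		return 0;
 *	bpf_wq_start(&val->wq, 0);
 */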
2736
2737__bpf_kfunc void bpf_preempt_disable(void)
2738{
2739	preempt_disable();
2740}
2741
2742__bpf_kfunc void bpf_preempt_enable(void)
2743{
2744	preempt_enable();
2745}
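
/* Sketch: the two kfuncs must be used as a balanced pair within the program,
 * bracketing a short non-sleepable section:
 *
 *	bpf_preempt_disable();
 *	// ... per-CPU work that must not be preempted ...
 *	bpf_preempt_enable();
 */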
2746
2747__bpf_kfunc_end_defs();
2748
2749BTF_KFUNCS_START(generic_btf_ids)
2750#ifdef CONFIG_CRASH_DUMP
2751BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
2752#endif
2753BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
2754BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
2755BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
2756BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
2757BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
2758BTF_ID_FLAGS(func, bpf_list_push_front_impl)
2759BTF_ID_FLAGS(func, bpf_list_push_back_impl)
2760BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
2761BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
2762BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2763BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
2764BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
2765BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
2766BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
2767
2768#ifdef CONFIG_CGROUPS
2769BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2770BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
2771BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2772BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
2773BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
2774BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2775#endif
2776BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
2777BTF_ID_FLAGS(func, bpf_throw)
2778BTF_KFUNCS_END(generic_btf_ids)
2779
2780static const struct btf_kfunc_id_set generic_kfunc_set = {
2781	.owner = THIS_MODULE,
2782	.set   = &generic_btf_ids,
2783};
2784
2786BTF_ID_LIST(generic_dtor_ids)
2787BTF_ID(struct, task_struct)
2788BTF_ID(func, bpf_task_release_dtor)
2789#ifdef CONFIG_CGROUPS
2790BTF_ID(struct, cgroup)
2791BTF_ID(func, bpf_cgroup_release_dtor)
2792#endif
2793
2794BTF_KFUNCS_START(common_btf_ids)
2795BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
2796BTF_ID_FLAGS(func, bpf_rdonly_cast)
2797BTF_ID_FLAGS(func, bpf_rcu_read_lock)
2798BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
2799BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
2800BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
2801BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
2802BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
2803BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
2804BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
2805BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
2806BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
2807#ifdef CONFIG_CGROUPS
2808BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
2809BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
2810BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
2811BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
2812BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
2813BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
2814#endif
2815BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
2816BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
2817BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
2818BTF_ID_FLAGS(func, bpf_dynptr_adjust)
2819BTF_ID_FLAGS(func, bpf_dynptr_is_null)
2820BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
2821BTF_ID_FLAGS(func, bpf_dynptr_size)
2822BTF_ID_FLAGS(func, bpf_dynptr_clone)
2823BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
2824BTF_ID_FLAGS(func, bpf_wq_init)
2825BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
2826BTF_ID_FLAGS(func, bpf_wq_start)
2827BTF_ID_FLAGS(func, bpf_preempt_disable)
2828BTF_ID_FLAGS(func, bpf_preempt_enable)
2829BTF_KFUNCS_END(common_btf_ids)
2830
2831static const struct btf_kfunc_id_set common_kfunc_set = {
2832	.owner = THIS_MODULE,
2833	.set   = &common_btf_ids,
2834};
2835
2836static int __init kfunc_init(void)
2837{
2838	int ret;
2839	const struct btf_id_dtor_kfunc generic_dtors[] = {
2840		{
2841			.btf_id       = generic_dtor_ids[0],
2842			.kfunc_btf_id = generic_dtor_ids[1]
2843		},
2844#ifdef CONFIG_CGROUPS
2845		{
2846			.btf_id       = generic_dtor_ids[2],
2847			.kfunc_btf_id = generic_dtor_ids[3]
2848		},
2849#endif
2850	};
2851
2852	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
2853	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
2854	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
2855	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
2856	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
2857	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
2858						  ARRAY_SIZE(generic_dtors),
2859						  THIS_MODULE);
2860	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
2861}
2862
2863late_initcall(kfunc_init);
2864
2865/* Get a pointer to dynptr data up to len bytes for read-only access. If
2866 * the dynptr doesn't have contiguous data up to len bytes, return NULL.
2867 */
2868const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
2869{
2870	return bpf_dynptr_slice(ptr, 0, NULL, len);
2871}
2872
2873/* Get a pointer to dynptr data up to len bytes for read-write access. If
2874 * the dynptr doesn't have contiguous data up to len bytes, or the dynptr
2875 * is read only, return NULL.
2876 */
2877void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
2878{
2879	if (__bpf_dynptr_is_rdonly(ptr))
2880		return NULL;
2881	return (void *)__bpf_dynptr_data(ptr, len);
2882}
2883