1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 */
4#include <linux/bpf.h>
5#include <linux/bpf-cgroup.h>
6#include <linux/bpf_trace.h>
7#include <linux/bpf_lirc.h>
8#include <linux/bpf_verifier.h>
9#include <linux/bsearch.h>
10#include <linux/btf.h>
11#include <linux/syscalls.h>
12#include <linux/slab.h>
13#include <linux/sched/signal.h>
14#include <linux/vmalloc.h>
15#include <linux/mmzone.h>
16#include <linux/anon_inodes.h>
17#include <linux/fdtable.h>
18#include <linux/file.h>
19#include <linux/fs.h>
20#include <linux/license.h>
21#include <linux/filter.h>
22#include <linux/kernel.h>
23#include <linux/idr.h>
24#include <linux/cred.h>
25#include <linux/timekeeping.h>
26#include <linux/ctype.h>
27#include <linux/nospec.h>
28#include <linux/audit.h>
29#include <uapi/linux/btf.h>
30#include <linux/pgtable.h>
31#include <linux/bpf_lsm.h>
32#include <linux/poll.h>
33#include <linux/sort.h>
34#include <linux/bpf-netns.h>
35#include <linux/rcupdate_trace.h>
36#include <linux/memcontrol.h>
37#include <linux/trace_events.h>
38
39#include <net/netfilter/nf_bpf_link.h>
40#include <net/netkit.h>
41#include <net/tcx.h>
42
43#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
44			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
45			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
46#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
47#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
48#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
49			IS_FD_HASH(map))
50
51#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
52
53DEFINE_PER_CPU(int, bpf_prog_active);
54static DEFINE_IDR(prog_idr);
55static DEFINE_SPINLOCK(prog_idr_lock);
56static DEFINE_IDR(map_idr);
57static DEFINE_SPINLOCK(map_idr_lock);
58static DEFINE_IDR(link_idr);
59static DEFINE_SPINLOCK(link_idr_lock);
60
61int sysctl_unprivileged_bpf_disabled __read_mostly =
62	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
63
64static const struct bpf_map_ops * const bpf_map_types[] = {
65#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
66#define BPF_MAP_TYPE(_id, _ops) \
67	[_id] = &_ops,
68#define BPF_LINK_TYPE(_id, _name)
69#include <linux/bpf_types.h>
70#undef BPF_PROG_TYPE
71#undef BPF_MAP_TYPE
72#undef BPF_LINK_TYPE
73};
74
75/*
76 * If we're handed a bigger struct than we know of, ensure all the unknown bits
77 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
78 * we don't know about yet.
79 *
80 * There is a ToCToU between this function call and the following
81 * copy_from_user() call. However, this is not a concern since this function is
82 * meant to be a future-proofing of bits.
83 */
84int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
85			     size_t expected_size,
86			     size_t actual_size)
87{
88	int res;
89
90	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
91		return -E2BIG;
92
93	if (actual_size <= expected_size)
94		return 0;
95
96	if (uaddr.is_kernel)
97		res = memchr_inv(uaddr.kernel + expected_size, 0,
98				 actual_size - expected_size) == NULL;
99	else
100		res = check_zeroed_user(uaddr.user + expected_size,
101					actual_size - expected_size);
102	if (res < 0)
103		return res;
104	return res ? 0 : -E2BIG;
105}
106
107const struct bpf_map_ops bpf_map_offload_ops = {
108	.map_meta_equal = bpf_map_meta_equal,
109	.map_alloc = bpf_map_offload_map_alloc,
110	.map_free = bpf_map_offload_map_free,
111	.map_check_btf = map_check_no_btf,
112	.map_mem_usage = bpf_map_offload_map_mem_usage,
113};
114
115static void bpf_map_write_active_inc(struct bpf_map *map)
116{
117	atomic64_inc(&map->writecnt);
118}
119
120static void bpf_map_write_active_dec(struct bpf_map *map)
121{
122	atomic64_dec(&map->writecnt);
123}
124
125bool bpf_map_write_active(const struct bpf_map *map)
126{
127	return atomic64_read(&map->writecnt) != 0;
128}
129
130static u32 bpf_map_value_size(const struct bpf_map *map)
131{
132	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
133	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
134	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
135	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
136		return round_up(map->value_size, 8) * num_possible_cpus();
137	else if (IS_FD_MAP(map))
138		return sizeof(u32);
139	else
140		return  map->value_size;
141}
142
143static void maybe_wait_bpf_programs(struct bpf_map *map)
144{
145	/* Wait for any running non-sleepable BPF programs to complete so that
146	 * userspace, when we return to it, knows that all non-sleepable
147	 * programs that could be running use the new map value. For sleepable
148	 * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
149	 * for the completions of these programs, but considering the waiting
150	 * time can be very long and userspace may think it will hang forever,
151	 * so don't handle sleepable BPF programs now.
152	 */
153	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
154	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
155		synchronize_rcu();
156}
157
158static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
159				void *key, void *value, __u64 flags)
160{
161	int err;
162
163	/* Need to create a kthread, thus must support schedule */
164	if (bpf_map_is_offloaded(map)) {
165		return bpf_map_offload_update_elem(map, key, value, flags);
166	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
167		   map->map_type == BPF_MAP_TYPE_ARENA ||
168		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
169		return map->ops->map_update_elem(map, key, value, flags);
170	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
171		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
172		return sock_map_update_elem_sys(map, key, value, flags);
173	} else if (IS_FD_PROG_ARRAY(map)) {
174		return bpf_fd_array_map_update_elem(map, map_file, key, value,
175						    flags);
176	}
177
178	bpf_disable_instrumentation();
179	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
180	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
181		err = bpf_percpu_hash_update(map, key, value, flags);
182	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
183		err = bpf_percpu_array_update(map, key, value, flags);
184	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
185		err = bpf_percpu_cgroup_storage_update(map, key, value,
186						       flags);
187	} else if (IS_FD_ARRAY(map)) {
188		err = bpf_fd_array_map_update_elem(map, map_file, key, value,
189						   flags);
190	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
191		err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
192						  flags);
193	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
194		/* rcu_read_lock() is not needed */
195		err = bpf_fd_reuseport_array_update_elem(map, key, value,
196							 flags);
197	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
198		   map->map_type == BPF_MAP_TYPE_STACK ||
199		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
200		err = map->ops->map_push_elem(map, value, flags);
201	} else {
202		rcu_read_lock();
203		err = map->ops->map_update_elem(map, key, value, flags);
204		rcu_read_unlock();
205	}
206	bpf_enable_instrumentation();
207
208	return err;
209}
210
211static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
212			      __u64 flags)
213{
214	void *ptr;
215	int err;
216
217	if (bpf_map_is_offloaded(map))
218		return bpf_map_offload_lookup_elem(map, key, value);
219
220	bpf_disable_instrumentation();
221	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
222	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
223		err = bpf_percpu_hash_copy(map, key, value);
224	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
225		err = bpf_percpu_array_copy(map, key, value);
226	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
227		err = bpf_percpu_cgroup_storage_copy(map, key, value);
228	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
229		err = bpf_stackmap_copy(map, key, value);
230	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
231		err = bpf_fd_array_map_lookup_elem(map, key, value);
232	} else if (IS_FD_HASH(map)) {
233		err = bpf_fd_htab_map_lookup_elem(map, key, value);
234	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
235		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
236	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
237		   map->map_type == BPF_MAP_TYPE_STACK ||
238		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
239		err = map->ops->map_peek_elem(map, value);
240	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
241		/* struct_ops map requires directly updating "value" */
242		err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
243	} else {
244		rcu_read_lock();
245		if (map->ops->map_lookup_elem_sys_only)
246			ptr = map->ops->map_lookup_elem_sys_only(map, key);
247		else
248			ptr = map->ops->map_lookup_elem(map, key);
249		if (IS_ERR(ptr)) {
250			err = PTR_ERR(ptr);
251		} else if (!ptr) {
252			err = -ENOENT;
253		} else {
254			err = 0;
255			if (flags & BPF_F_LOCK)
256				/* lock 'ptr' and copy everything but lock */
257				copy_map_value_locked(map, value, ptr, true);
258			else
259				copy_map_value(map, value, ptr);
260			/* mask lock and timer, since value wasn't zero inited */
261			check_and_init_map_value(map, value);
262		}
263		rcu_read_unlock();
264	}
265
266	bpf_enable_instrumentation();
267
268	return err;
269}
270
271/* Please, do not use this function outside from the map creation path
272 * (e.g. in map update path) without taking care of setting the active
273 * memory cgroup (see at bpf_map_kmalloc_node() for example).
274 */
275static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
276{
277	/* We really just want to fail instead of triggering OOM killer
278	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
279	 * which is used for lower order allocation requests.
280	 *
281	 * It has been observed that higher order allocation requests done by
282	 * vmalloc with __GFP_NORETRY being set might fail due to not trying
283	 * to reclaim memory from the page cache, thus we set
284	 * __GFP_RETRY_MAYFAIL to avoid such situations.
285	 */
286
287	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
288	unsigned int flags = 0;
289	unsigned long align = 1;
290	void *area;
291
292	if (size >= SIZE_MAX)
293		return NULL;
294
295	/* kmalloc()'ed memory can't be mmap()'ed */
296	if (mmapable) {
297		BUG_ON(!PAGE_ALIGNED(size));
298		align = SHMLBA;
299		flags = VM_USERMAP;
300	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
301		area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
302				    numa_node);
303		if (area != NULL)
304			return area;
305	}
306
307	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
308			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
309			flags, numa_node, __builtin_return_address(0));
310}
311
312void *bpf_map_area_alloc(u64 size, int numa_node)
313{
314	return __bpf_map_area_alloc(size, numa_node, false);
315}
316
317void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
318{
319	return __bpf_map_area_alloc(size, numa_node, true);
320}
321
322void bpf_map_area_free(void *area)
323{
324	kvfree(area);
325}
326
327static u32 bpf_map_flags_retain_permanent(u32 flags)
328{
329	/* Some map creation flags are not tied to the map object but
330	 * rather to the map fd instead, so they have no meaning upon
331	 * map object inspection since multiple file descriptors with
332	 * different (access) properties can exist here. Thus, given
333	 * this has zero meaning for the map itself, lets clear these
334	 * from here.
335	 */
336	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
337}
338
339void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
340{
341	map->map_type = attr->map_type;
342	map->key_size = attr->key_size;
343	map->value_size = attr->value_size;
344	map->max_entries = attr->max_entries;
345	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
346	map->numa_node = bpf_map_attr_numa_node(attr);
347	map->map_extra = attr->map_extra;
348}
349
350static int bpf_map_alloc_id(struct bpf_map *map)
351{
352	int id;
353
354	idr_preload(GFP_KERNEL);
355	spin_lock_bh(&map_idr_lock);
356	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
357	if (id > 0)
358		map->id = id;
359	spin_unlock_bh(&map_idr_lock);
360	idr_preload_end();
361
362	if (WARN_ON_ONCE(!id))
363		return -ENOSPC;
364
365	return id > 0 ? 0 : id;
366}
367
368void bpf_map_free_id(struct bpf_map *map)
369{
370	unsigned long flags;
371
372	/* Offloaded maps are removed from the IDR store when their device
373	 * disappears - even if someone holds an fd to them they are unusable,
374	 * the memory is gone, all ops will fail; they are simply waiting for
375	 * refcnt to drop to be freed.
376	 */
377	if (!map->id)
378		return;
379
380	spin_lock_irqsave(&map_idr_lock, flags);
381
382	idr_remove(&map_idr, map->id);
383	map->id = 0;
384
385	spin_unlock_irqrestore(&map_idr_lock, flags);
386}
387
388#ifdef CONFIG_MEMCG_KMEM
389static void bpf_map_save_memcg(struct bpf_map *map)
390{
391	/* Currently if a map is created by a process belonging to the root
392	 * memory cgroup, get_obj_cgroup_from_current() will return NULL.
393	 * So we have to check map->objcg for being NULL each time it's
394	 * being used.
395	 */
396	if (memcg_bpf_enabled())
397		map->objcg = get_obj_cgroup_from_current();
398}
399
400static void bpf_map_release_memcg(struct bpf_map *map)
401{
402	if (map->objcg)
403		obj_cgroup_put(map->objcg);
404}
405
406static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
407{
408	if (map->objcg)
409		return get_mem_cgroup_from_objcg(map->objcg);
410
411	return root_mem_cgroup;
412}
413
414void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
415			   int node)
416{
417	struct mem_cgroup *memcg, *old_memcg;
418	void *ptr;
419
420	memcg = bpf_map_get_memcg(map);
421	old_memcg = set_active_memcg(memcg);
422	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
423	set_active_memcg(old_memcg);
424	mem_cgroup_put(memcg);
425
426	return ptr;
427}
428
429void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
430{
431	struct mem_cgroup *memcg, *old_memcg;
432	void *ptr;
433
434	memcg = bpf_map_get_memcg(map);
435	old_memcg = set_active_memcg(memcg);
436	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
437	set_active_memcg(old_memcg);
438	mem_cgroup_put(memcg);
439
440	return ptr;
441}
442
443void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
444		       gfp_t flags)
445{
446	struct mem_cgroup *memcg, *old_memcg;
447	void *ptr;
448
449	memcg = bpf_map_get_memcg(map);
450	old_memcg = set_active_memcg(memcg);
451	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
452	set_active_memcg(old_memcg);
453	mem_cgroup_put(memcg);
454
455	return ptr;
456}
457
458void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
459				    size_t align, gfp_t flags)
460{
461	struct mem_cgroup *memcg, *old_memcg;
462	void __percpu *ptr;
463
464	memcg = bpf_map_get_memcg(map);
465	old_memcg = set_active_memcg(memcg);
466	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
467	set_active_memcg(old_memcg);
468	mem_cgroup_put(memcg);
469
470	return ptr;
471}
472
473#else
474static void bpf_map_save_memcg(struct bpf_map *map)
475{
476}
477
478static void bpf_map_release_memcg(struct bpf_map *map)
479{
480}
481#endif
482
483int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
484			unsigned long nr_pages, struct page **pages)
485{
486	unsigned long i, j;
487	struct page *pg;
488	int ret = 0;
489#ifdef CONFIG_MEMCG_KMEM
490	struct mem_cgroup *memcg, *old_memcg;
491
492	memcg = bpf_map_get_memcg(map);
493	old_memcg = set_active_memcg(memcg);
494#endif
495	for (i = 0; i < nr_pages; i++) {
496		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
497
498		if (pg) {
499			pages[i] = pg;
500			continue;
501		}
502		for (j = 0; j < i; j++)
503			__free_page(pages[j]);
504		ret = -ENOMEM;
505		break;
506	}
507
508#ifdef CONFIG_MEMCG_KMEM
509	set_active_memcg(old_memcg);
510	mem_cgroup_put(memcg);
511#endif
512	return ret;
513}
514
515
516static int btf_field_cmp(const void *a, const void *b)
517{
518	const struct btf_field *f1 = a, *f2 = b;
519
520	if (f1->offset < f2->offset)
521		return -1;
522	else if (f1->offset > f2->offset)
523		return 1;
524	return 0;
525}
526
527struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
528				  u32 field_mask)
529{
530	struct btf_field *field;
531
532	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
533		return NULL;
534	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
535	if (!field || !(field->type & field_mask))
536		return NULL;
537	return field;
538}
539
540void btf_record_free(struct btf_record *rec)
541{
542	int i;
543
544	if (IS_ERR_OR_NULL(rec))
545		return;
546	for (i = 0; i < rec->cnt; i++) {
547		switch (rec->fields[i].type) {
548		case BPF_KPTR_UNREF:
549		case BPF_KPTR_REF:
550		case BPF_KPTR_PERCPU:
551			if (rec->fields[i].kptr.module)
552				module_put(rec->fields[i].kptr.module);
553			btf_put(rec->fields[i].kptr.btf);
554			break;
555		case BPF_LIST_HEAD:
556		case BPF_LIST_NODE:
557		case BPF_RB_ROOT:
558		case BPF_RB_NODE:
559		case BPF_SPIN_LOCK:
560		case BPF_TIMER:
561		case BPF_REFCOUNT:
562		case BPF_WORKQUEUE:
563			/* Nothing to release */
564			break;
565		default:
566			WARN_ON_ONCE(1);
567			continue;
568		}
569	}
570	kfree(rec);
571}
572
573void bpf_map_free_record(struct bpf_map *map)
574{
575	btf_record_free(map->record);
576	map->record = NULL;
577}
578
579struct btf_record *btf_record_dup(const struct btf_record *rec)
580{
581	const struct btf_field *fields;
582	struct btf_record *new_rec;
583	int ret, size, i;
584
585	if (IS_ERR_OR_NULL(rec))
586		return NULL;
587	size = offsetof(struct btf_record, fields[rec->cnt]);
588	new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
589	if (!new_rec)
590		return ERR_PTR(-ENOMEM);
591	/* Do a deep copy of the btf_record */
592	fields = rec->fields;
593	new_rec->cnt = 0;
594	for (i = 0; i < rec->cnt; i++) {
595		switch (fields[i].type) {
596		case BPF_KPTR_UNREF:
597		case BPF_KPTR_REF:
598		case BPF_KPTR_PERCPU:
599			btf_get(fields[i].kptr.btf);
600			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
601				ret = -ENXIO;
602				goto free;
603			}
604			break;
605		case BPF_LIST_HEAD:
606		case BPF_LIST_NODE:
607		case BPF_RB_ROOT:
608		case BPF_RB_NODE:
609		case BPF_SPIN_LOCK:
610		case BPF_TIMER:
611		case BPF_REFCOUNT:
612		case BPF_WORKQUEUE:
613			/* Nothing to acquire */
614			break;
615		default:
616			ret = -EFAULT;
617			WARN_ON_ONCE(1);
618			goto free;
619		}
620		new_rec->cnt++;
621	}
622	return new_rec;
623free:
624	btf_record_free(new_rec);
625	return ERR_PTR(ret);
626}
627
628bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
629{
630	bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
631	int size;
632
633	if (!a_has_fields && !b_has_fields)
634		return true;
635	if (a_has_fields != b_has_fields)
636		return false;
637	if (rec_a->cnt != rec_b->cnt)
638		return false;
639	size = offsetof(struct btf_record, fields[rec_a->cnt]);
640	/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
641	 * members are zeroed out. So memcmp is safe to do without worrying
642	 * about padding/unused fields.
643	 *
644	 * While spin_lock, timer, and kptr have no relation to map BTF,
645	 * list_head metadata is specific to map BTF, the btf and value_rec
646	 * members in particular. btf is the map BTF, while value_rec points to
647	 * btf_record in that map BTF.
648	 *
649	 * So while by default, we don't rely on the map BTF (which the records
650	 * were parsed from) matching for both records, which is not backwards
651	 * compatible, in case list_head is part of it, we implicitly rely on
652	 * that by way of depending on memcmp succeeding for it.
653	 */
654	return !memcmp(rec_a, rec_b, size);
655}
656
657void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
658{
659	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
660		return;
661	bpf_timer_cancel_and_free(obj + rec->timer_off);
662}
663
664void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
665{
666	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
667		return;
668	bpf_wq_cancel_and_free(obj + rec->wq_off);
669}
670
671void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
672{
673	const struct btf_field *fields;
674	int i;
675
676	if (IS_ERR_OR_NULL(rec))
677		return;
678	fields = rec->fields;
679	for (i = 0; i < rec->cnt; i++) {
680		struct btf_struct_meta *pointee_struct_meta;
681		const struct btf_field *field = &fields[i];
682		void *field_ptr = obj + field->offset;
683		void *xchgd_field;
684
685		switch (fields[i].type) {
686		case BPF_SPIN_LOCK:
687			break;
688		case BPF_TIMER:
689			bpf_timer_cancel_and_free(field_ptr);
690			break;
691		case BPF_WORKQUEUE:
692			bpf_wq_cancel_and_free(field_ptr);
693			break;
694		case BPF_KPTR_UNREF:
695			WRITE_ONCE(*(u64 *)field_ptr, 0);
696			break;
697		case BPF_KPTR_REF:
698		case BPF_KPTR_PERCPU:
699			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
700			if (!xchgd_field)
701				break;
702
703			if (!btf_is_kernel(field->kptr.btf)) {
704				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
705									   field->kptr.btf_id);
706				migrate_disable();
707				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
708								 pointee_struct_meta->record : NULL,
709								 fields[i].type == BPF_KPTR_PERCPU);
710				migrate_enable();
711			} else {
712				field->kptr.dtor(xchgd_field);
713			}
714			break;
715		case BPF_LIST_HEAD:
716			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
717				continue;
718			bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
719			break;
720		case BPF_RB_ROOT:
721			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
722				continue;
723			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
724			break;
725		case BPF_LIST_NODE:
726		case BPF_RB_NODE:
727		case BPF_REFCOUNT:
728			break;
729		default:
730			WARN_ON_ONCE(1);
731			continue;
732		}
733	}
734}
735
736/* called from workqueue */
737static void bpf_map_free_deferred(struct work_struct *work)
738{
739	struct bpf_map *map = container_of(work, struct bpf_map, work);
740	struct btf_record *rec = map->record;
741	struct btf *btf = map->btf;
742
743	security_bpf_map_free(map);
744	bpf_map_release_memcg(map);
745	/* implementation dependent freeing */
746	map->ops->map_free(map);
747	/* Delay freeing of btf_record for maps, as map_free
748	 * callback usually needs access to them. It is better to do it here
749	 * than require each callback to do the free itself manually.
750	 *
751	 * Note that the btf_record stashed in map->inner_map_meta->record was
752	 * already freed using the map_free callback for map in map case which
753	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
754	 * template bpf_map struct used during verification.
755	 */
756	btf_record_free(rec);
757	/* Delay freeing of btf for maps, as map_free callback may need
758	 * struct_meta info which will be freed with btf_put().
759	 */
760	btf_put(btf);
761}
762
763static void bpf_map_put_uref(struct bpf_map *map)
764{
765	if (atomic64_dec_and_test(&map->usercnt)) {
766		if (map->ops->map_release_uref)
767			map->ops->map_release_uref(map);
768	}
769}
770
771static void bpf_map_free_in_work(struct bpf_map *map)
772{
773	INIT_WORK(&map->work, bpf_map_free_deferred);
774	/* Avoid spawning kworkers, since they all might contend
775	 * for the same mutex like slab_mutex.
776	 */
777	queue_work(system_unbound_wq, &map->work);
778}
779
780static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
781{
782	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
783}
784
785static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
786{
787	if (rcu_trace_implies_rcu_gp())
788		bpf_map_free_rcu_gp(rcu);
789	else
790		call_rcu(rcu, bpf_map_free_rcu_gp);
791}
792
793/* decrement map refcnt and schedule it for freeing via workqueue
794 * (underlying map implementation ops->map_free() might sleep)
795 */
796void bpf_map_put(struct bpf_map *map)
797{
798	if (atomic64_dec_and_test(&map->refcnt)) {
799		/* bpf_map_free_id() must be called first */
800		bpf_map_free_id(map);
801
802		WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
803		if (READ_ONCE(map->free_after_mult_rcu_gp))
804			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
805		else if (READ_ONCE(map->free_after_rcu_gp))
806			call_rcu(&map->rcu, bpf_map_free_rcu_gp);
807		else
808			bpf_map_free_in_work(map);
809	}
810}
811EXPORT_SYMBOL_GPL(bpf_map_put);
812
813void bpf_map_put_with_uref(struct bpf_map *map)
814{
815	bpf_map_put_uref(map);
816	bpf_map_put(map);
817}
818
819static int bpf_map_release(struct inode *inode, struct file *filp)
820{
821	struct bpf_map *map = filp->private_data;
822
823	if (map->ops->map_release)
824		map->ops->map_release(map, filp);
825
826	bpf_map_put_with_uref(map);
827	return 0;
828}
829
830static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
831{
832	fmode_t mode = f.file->f_mode;
833
834	/* Our file permissions may have been overridden by global
835	 * map permissions facing syscall side.
836	 */
837	if (READ_ONCE(map->frozen))
838		mode &= ~FMODE_CAN_WRITE;
839	return mode;
840}
841
842#ifdef CONFIG_PROC_FS
843/* Show the memory usage of a bpf map */
844static u64 bpf_map_memory_usage(const struct bpf_map *map)
845{
846	return map->ops->map_mem_usage(map);
847}
848
849static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
850{
851	struct bpf_map *map = filp->private_data;
852	u32 type = 0, jited = 0;
853
854	if (map_type_contains_progs(map)) {
855		spin_lock(&map->owner.lock);
856		type  = map->owner.type;
857		jited = map->owner.jited;
858		spin_unlock(&map->owner.lock);
859	}
860
861	seq_printf(m,
862		   "map_type:\t%u\n"
863		   "key_size:\t%u\n"
864		   "value_size:\t%u\n"
865		   "max_entries:\t%u\n"
866		   "map_flags:\t%#x\n"
867		   "map_extra:\t%#llx\n"
868		   "memlock:\t%llu\n"
869		   "map_id:\t%u\n"
870		   "frozen:\t%u\n",
871		   map->map_type,
872		   map->key_size,
873		   map->value_size,
874		   map->max_entries,
875		   map->map_flags,
876		   (unsigned long long)map->map_extra,
877		   bpf_map_memory_usage(map),
878		   map->id,
879		   READ_ONCE(map->frozen));
880	if (type) {
881		seq_printf(m, "owner_prog_type:\t%u\n", type);
882		seq_printf(m, "owner_jited:\t%u\n", jited);
883	}
884}
885#endif
886
887static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
888			      loff_t *ppos)
889{
890	/* We need this handler such that alloc_file() enables
891	 * f_mode with FMODE_CAN_READ.
892	 */
893	return -EINVAL;
894}
895
896static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
897			       size_t siz, loff_t *ppos)
898{
899	/* We need this handler such that alloc_file() enables
900	 * f_mode with FMODE_CAN_WRITE.
901	 */
902	return -EINVAL;
903}
904
905/* called for any extra memory-mapped regions (except initial) */
906static void bpf_map_mmap_open(struct vm_area_struct *vma)
907{
908	struct bpf_map *map = vma->vm_file->private_data;
909
910	if (vma->vm_flags & VM_MAYWRITE)
911		bpf_map_write_active_inc(map);
912}
913
914/* called for all unmapped memory region (including initial) */
915static void bpf_map_mmap_close(struct vm_area_struct *vma)
916{
917	struct bpf_map *map = vma->vm_file->private_data;
918
919	if (vma->vm_flags & VM_MAYWRITE)
920		bpf_map_write_active_dec(map);
921}
922
923static const struct vm_operations_struct bpf_map_default_vmops = {
924	.open		= bpf_map_mmap_open,
925	.close		= bpf_map_mmap_close,
926};
927
928static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
929{
930	struct bpf_map *map = filp->private_data;
931	int err;
932
933	if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
934		return -ENOTSUPP;
935
936	if (!(vma->vm_flags & VM_SHARED))
937		return -EINVAL;
938
939	mutex_lock(&map->freeze_mutex);
940
941	if (vma->vm_flags & VM_WRITE) {
942		if (map->frozen) {
943			err = -EPERM;
944			goto out;
945		}
946		/* map is meant to be read-only, so do not allow mapping as
947		 * writable, because it's possible to leak a writable page
948		 * reference and allows user-space to still modify it after
949		 * freezing, while verifier will assume contents do not change
950		 */
951		if (map->map_flags & BPF_F_RDONLY_PROG) {
952			err = -EACCES;
953			goto out;
954		}
955	}
956
957	/* set default open/close callbacks */
958	vma->vm_ops = &bpf_map_default_vmops;
959	vma->vm_private_data = map;
960	vm_flags_clear(vma, VM_MAYEXEC);
961	if (!(vma->vm_flags & VM_WRITE))
962		/* disallow re-mapping with PROT_WRITE */
963		vm_flags_clear(vma, VM_MAYWRITE);
964
965	err = map->ops->map_mmap(map, vma);
966	if (err)
967		goto out;
968
969	if (vma->vm_flags & VM_MAYWRITE)
970		bpf_map_write_active_inc(map);
971out:
972	mutex_unlock(&map->freeze_mutex);
973	return err;
974}
975
976static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
977{
978	struct bpf_map *map = filp->private_data;
979
980	if (map->ops->map_poll)
981		return map->ops->map_poll(map, filp, pts);
982
983	return EPOLLERR;
984}
985
986static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
987					   unsigned long len, unsigned long pgoff,
988					   unsigned long flags)
989{
990	struct bpf_map *map = filp->private_data;
991
992	if (map->ops->map_get_unmapped_area)
993		return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
994#ifdef CONFIG_MMU
995	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
996#else
997	return addr;
998#endif
999}
1000
1001const struct file_operations bpf_map_fops = {
1002#ifdef CONFIG_PROC_FS
1003	.show_fdinfo	= bpf_map_show_fdinfo,
1004#endif
1005	.release	= bpf_map_release,
1006	.read		= bpf_dummy_read,
1007	.write		= bpf_dummy_write,
1008	.mmap		= bpf_map_mmap,
1009	.poll		= bpf_map_poll,
1010	.get_unmapped_area = bpf_get_unmapped_area,
1011};
1012
1013int bpf_map_new_fd(struct bpf_map *map, int flags)
1014{
1015	int ret;
1016
1017	ret = security_bpf_map(map, OPEN_FMODE(flags));
1018	if (ret < 0)
1019		return ret;
1020
1021	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
1022				flags | O_CLOEXEC);
1023}
1024
1025int bpf_get_file_flag(int flags)
1026{
1027	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
1028		return -EINVAL;
1029	if (flags & BPF_F_RDONLY)
1030		return O_RDONLY;
1031	if (flags & BPF_F_WRONLY)
1032		return O_WRONLY;
1033	return O_RDWR;
1034}
1035
1036/* helper macro to check that unused fields 'union bpf_attr' are zero */
1037#define CHECK_ATTR(CMD) \
1038	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
1039		   sizeof(attr->CMD##_LAST_FIELD), 0, \
1040		   sizeof(*attr) - \
1041		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
1042		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
1043
1044/* dst and src must have at least "size" number of bytes.
1045 * Return strlen on success and < 0 on error.
1046 */
1047int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
1048{
1049	const char *end = src + size;
1050	const char *orig_src = src;
1051
1052	memset(dst, 0, size);
1053	/* Copy all isalnum(), '_' and '.' chars. */
1054	while (src < end && *src) {
1055		if (!isalnum(*src) &&
1056		    *src != '_' && *src != '.')
1057			return -EINVAL;
1058		*dst++ = *src++;
1059	}
1060
1061	/* No '\0' found in "size" number of bytes */
1062	if (src == end)
1063		return -EINVAL;
1064
1065	return src - orig_src;
1066}
1067
1068int map_check_no_btf(const struct bpf_map *map,
1069		     const struct btf *btf,
1070		     const struct btf_type *key_type,
1071		     const struct btf_type *value_type)
1072{
1073	return -ENOTSUPP;
1074}
1075
1076static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
1077			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
1078{
1079	const struct btf_type *key_type, *value_type;
1080	u32 key_size, value_size;
1081	int ret = 0;
1082
1083	/* Some maps allow key to be unspecified. */
1084	if (btf_key_id) {
1085		key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
1086		if (!key_type || key_size != map->key_size)
1087			return -EINVAL;
1088	} else {
1089		key_type = btf_type_by_id(btf, 0);
1090		if (!map->ops->map_check_btf)
1091			return -EINVAL;
1092	}
1093
1094	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
1095	if (!value_type || value_size != map->value_size)
1096		return -EINVAL;
1097
1098	map->record = btf_parse_fields(btf, value_type,
1099				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
1100				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
1101				       map->value_size);
1102	if (!IS_ERR_OR_NULL(map->record)) {
1103		int i;
1104
1105		if (!bpf_token_capable(token, CAP_BPF)) {
1106			ret = -EPERM;
1107			goto free_map_tab;
1108		}
1109		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
1110			ret = -EACCES;
1111			goto free_map_tab;
1112		}
1113		for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
1114			switch (map->record->field_mask & (1 << i)) {
1115			case 0:
1116				continue;
1117			case BPF_SPIN_LOCK:
1118				if (map->map_type != BPF_MAP_TYPE_HASH &&
1119				    map->map_type != BPF_MAP_TYPE_ARRAY &&
1120				    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
1121				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1122				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1123				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1124				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1125					ret = -EOPNOTSUPP;
1126					goto free_map_tab;
1127				}
1128				break;
1129			case BPF_TIMER:
1130			case BPF_WORKQUEUE:
1131				if (map->map_type != BPF_MAP_TYPE_HASH &&
1132				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1133				    map->map_type != BPF_MAP_TYPE_ARRAY) {
1134					ret = -EOPNOTSUPP;
1135					goto free_map_tab;
1136				}
1137				break;
1138			case BPF_KPTR_UNREF:
1139			case BPF_KPTR_REF:
1140			case BPF_KPTR_PERCPU:
1141			case BPF_REFCOUNT:
1142				if (map->map_type != BPF_MAP_TYPE_HASH &&
1143				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
1144				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1145				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
1146				    map->map_type != BPF_MAP_TYPE_ARRAY &&
1147				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
1148				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1149				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1150				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1151				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1152					ret = -EOPNOTSUPP;
1153					goto free_map_tab;
1154				}
1155				break;
1156			case BPF_LIST_HEAD:
1157			case BPF_RB_ROOT:
1158				if (map->map_type != BPF_MAP_TYPE_HASH &&
1159				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1160				    map->map_type != BPF_MAP_TYPE_ARRAY) {
1161					ret = -EOPNOTSUPP;
1162					goto free_map_tab;
1163				}
1164				break;
1165			default:
1166				/* Fail if map_type checks are missing for a field type */
1167				ret = -EOPNOTSUPP;
1168				goto free_map_tab;
1169			}
1170		}
1171	}
1172
1173	ret = btf_check_and_fixup_fields(btf, map->record);
1174	if (ret < 0)
1175		goto free_map_tab;
1176
1177	if (map->ops->map_check_btf) {
1178		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
1179		if (ret < 0)
1180			goto free_map_tab;
1181	}
1182
1183	return ret;
1184free_map_tab:
1185	bpf_map_free_record(map);
1186	return ret;
1187}
1188
1189static bool bpf_net_capable(void)
1190{
1191	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
1192}
1193
1194#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
1195/* called via syscall */
1196static int map_create(union bpf_attr *attr)
1197{
1198	const struct bpf_map_ops *ops;
1199	struct bpf_token *token = NULL;
1200	int numa_node = bpf_map_attr_numa_node(attr);
1201	u32 map_type = attr->map_type;
1202	struct bpf_map *map;
1203	bool token_flag;
1204	int f_flags;
1205	int err;
1206
1207	err = CHECK_ATTR(BPF_MAP_CREATE);
1208	if (err)
1209		return -EINVAL;
1210
1211	/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
1212	 * to avoid per-map type checks tripping on unknown flag
1213	 */
1214	token_flag = attr->map_flags & BPF_F_TOKEN_FD;
1215	attr->map_flags &= ~BPF_F_TOKEN_FD;
1216
1217	if (attr->btf_vmlinux_value_type_id) {
1218		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
1219		    attr->btf_key_type_id || attr->btf_value_type_id)
1220			return -EINVAL;
1221	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
1222		return -EINVAL;
1223	}
1224
1225	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
1226	    attr->map_type != BPF_MAP_TYPE_ARENA &&
1227	    attr->map_extra != 0)
1228		return -EINVAL;
1229
1230	f_flags = bpf_get_file_flag(attr->map_flags);
1231	if (f_flags < 0)
1232		return f_flags;
1233
1234	if (numa_node != NUMA_NO_NODE &&
1235	    ((unsigned int)numa_node >= nr_node_ids ||
1236	     !node_online(numa_node)))
1237		return -EINVAL;
1238
1239	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
1240	map_type = attr->map_type;
1241	if (map_type >= ARRAY_SIZE(bpf_map_types))
1242		return -EINVAL;
1243	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
1244	ops = bpf_map_types[map_type];
1245	if (!ops)
1246		return -EINVAL;
1247
1248	if (ops->map_alloc_check) {
1249		err = ops->map_alloc_check(attr);
1250		if (err)
1251			return err;
1252	}
1253	if (attr->map_ifindex)
1254		ops = &bpf_map_offload_ops;
1255	if (!ops->map_mem_usage)
1256		return -EINVAL;
1257
1258	if (token_flag) {
1259		token = bpf_token_get_from_fd(attr->map_token_fd);
1260		if (IS_ERR(token))
1261			return PTR_ERR(token);
1262
1263		/* if current token doesn't grant map creation permissions,
1264		 * then we can't use this token, so ignore it and rely on
1265		 * system-wide capabilities checks
1266		 */
1267		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
1268		    !bpf_token_allow_map_type(token, attr->map_type)) {
1269			bpf_token_put(token);
1270			token = NULL;
1271		}
1272	}
1273
1274	err = -EPERM;
1275
1276	/* Intent here is for unprivileged_bpf_disabled to block BPF map
1277	 * creation for unprivileged users; other actions depend
1278	 * on fd availability and access to bpffs, so are dependent on
1279	 * object creation success. Even with unprivileged BPF disabled,
1280	 * capability checks are still carried out.
1281	 */
1282	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
1283		goto put_token;
1284
1285	/* check privileged map type permissions */
1286	switch (map_type) {
1287	case BPF_MAP_TYPE_ARRAY:
1288	case BPF_MAP_TYPE_PERCPU_ARRAY:
1289	case BPF_MAP_TYPE_PROG_ARRAY:
1290	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
1291	case BPF_MAP_TYPE_CGROUP_ARRAY:
1292	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1293	case BPF_MAP_TYPE_HASH:
1294	case BPF_MAP_TYPE_PERCPU_HASH:
1295	case BPF_MAP_TYPE_HASH_OF_MAPS:
1296	case BPF_MAP_TYPE_RINGBUF:
1297	case BPF_MAP_TYPE_USER_RINGBUF:
1298	case BPF_MAP_TYPE_CGROUP_STORAGE:
1299	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
1300		/* unprivileged */
1301		break;
1302	case BPF_MAP_TYPE_SK_STORAGE:
1303	case BPF_MAP_TYPE_INODE_STORAGE:
1304	case BPF_MAP_TYPE_TASK_STORAGE:
1305	case BPF_MAP_TYPE_CGRP_STORAGE:
1306	case BPF_MAP_TYPE_BLOOM_FILTER:
1307	case BPF_MAP_TYPE_LPM_TRIE:
1308	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
1309	case BPF_MAP_TYPE_STACK_TRACE:
1310	case BPF_MAP_TYPE_QUEUE:
1311	case BPF_MAP_TYPE_STACK:
1312	case BPF_MAP_TYPE_LRU_HASH:
1313	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
1314	case BPF_MAP_TYPE_STRUCT_OPS:
1315	case BPF_MAP_TYPE_CPUMAP:
1316	case BPF_MAP_TYPE_ARENA:
1317		if (!bpf_token_capable(token, CAP_BPF))
1318			goto put_token;
1319		break;
1320	case BPF_MAP_TYPE_SOCKMAP:
1321	case BPF_MAP_TYPE_SOCKHASH:
1322	case BPF_MAP_TYPE_DEVMAP:
1323	case BPF_MAP_TYPE_DEVMAP_HASH:
1324	case BPF_MAP_TYPE_XSKMAP:
1325		if (!bpf_token_capable(token, CAP_NET_ADMIN))
1326			goto put_token;
1327		break;
1328	default:
1329		WARN(1, "unsupported map type %d", map_type);
1330		goto put_token;
1331	}
1332
1333	map = ops->map_alloc(attr);
1334	if (IS_ERR(map)) {
1335		err = PTR_ERR(map);
1336		goto put_token;
1337	}
1338	map->ops = ops;
1339	map->map_type = map_type;
1340
1341	err = bpf_obj_name_cpy(map->name, attr->map_name,
1342			       sizeof(attr->map_name));
1343	if (err < 0)
1344		goto free_map;
1345
1346	atomic64_set(&map->refcnt, 1);
1347	atomic64_set(&map->usercnt, 1);
1348	mutex_init(&map->freeze_mutex);
1349	spin_lock_init(&map->owner.lock);
1350
1351	if (attr->btf_key_type_id || attr->btf_value_type_id ||
1352	    /* Even the map's value is a kernel's struct,
1353	     * the bpf_prog.o must have BTF to begin with
1354	     * to figure out the corresponding kernel's
1355	     * counter part.  Thus, attr->btf_fd has
1356	     * to be valid also.
1357	     */
1358	    attr->btf_vmlinux_value_type_id) {
1359		struct btf *btf;
1360
1361		btf = btf_get_by_fd(attr->btf_fd);
1362		if (IS_ERR(btf)) {
1363			err = PTR_ERR(btf);
1364			goto free_map;
1365		}
1366		if (btf_is_kernel(btf)) {
1367			btf_put(btf);
1368			err = -EACCES;
1369			goto free_map;
1370		}
1371		map->btf = btf;
1372
1373		if (attr->btf_value_type_id) {
1374			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
1375					    attr->btf_value_type_id);
1376			if (err)
1377				goto free_map;
1378		}
1379
1380		map->btf_key_type_id = attr->btf_key_type_id;
1381		map->btf_value_type_id = attr->btf_value_type_id;
1382		map->btf_vmlinux_value_type_id =
1383			attr->btf_vmlinux_value_type_id;
1384	}
1385
1386	err = security_bpf_map_create(map, attr, token);
1387	if (err)
1388		goto free_map_sec;
1389
1390	err = bpf_map_alloc_id(map);
1391	if (err)
1392		goto free_map_sec;
1393
1394	bpf_map_save_memcg(map);
1395	bpf_token_put(token);
1396
1397	err = bpf_map_new_fd(map, f_flags);
1398	if (err < 0) {
1399		/* failed to allocate fd.
1400		 * bpf_map_put_with_uref() is needed because the above
1401		 * bpf_map_alloc_id() has published the map
1402		 * to the userspace and the userspace may
1403		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
1404		 */
1405		bpf_map_put_with_uref(map);
1406		return err;
1407	}
1408
1409	return err;
1410
1411free_map_sec:
1412	security_bpf_map_free(map);
1413free_map:
1414	btf_put(map->btf);
1415	map->ops->map_free(map);
1416put_token:
1417	bpf_token_put(token);
1418	return err;
1419}
1420
1421/* if error is returned, fd is released.
1422 * On success caller should complete fd access with matching fdput()
1423 */
1424struct bpf_map *__bpf_map_get(struct fd f)
1425{
1426	if (!f.file)
1427		return ERR_PTR(-EBADF);
1428	if (f.file->f_op != &bpf_map_fops) {
1429		fdput(f);
1430		return ERR_PTR(-EINVAL);
1431	}
1432
1433	return f.file->private_data;
1434}
1435
1436void bpf_map_inc(struct bpf_map *map)
1437{
1438	atomic64_inc(&map->refcnt);
1439}
1440EXPORT_SYMBOL_GPL(bpf_map_inc);
1441
1442void bpf_map_inc_with_uref(struct bpf_map *map)
1443{
1444	atomic64_inc(&map->refcnt);
1445	atomic64_inc(&map->usercnt);
1446}
1447EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
1448
1449struct bpf_map *bpf_map_get(u32 ufd)
1450{
1451	struct fd f = fdget(ufd);
1452	struct bpf_map *map;
1453
1454	map = __bpf_map_get(f);
1455	if (IS_ERR(map))
1456		return map;
1457
1458	bpf_map_inc(map);
1459	fdput(f);
1460
1461	return map;
1462}
1463EXPORT_SYMBOL(bpf_map_get);
1464
1465struct bpf_map *bpf_map_get_with_uref(u32 ufd)
1466{
1467	struct fd f = fdget(ufd);
1468	struct bpf_map *map;
1469
1470	map = __bpf_map_get(f);
1471	if (IS_ERR(map))
1472		return map;
1473
1474	bpf_map_inc_with_uref(map);
1475	fdput(f);
1476
1477	return map;
1478}
1479
1480/* map_idr_lock should have been held or the map should have been
1481 * protected by rcu read lock.
1482 */
1483struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
1484{
1485	int refold;
1486
1487	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
1488	if (!refold)
1489		return ERR_PTR(-ENOENT);
1490	if (uref)
1491		atomic64_inc(&map->usercnt);
1492
1493	return map;
1494}
1495
1496struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
1497{
1498	spin_lock_bh(&map_idr_lock);
1499	map = __bpf_map_inc_not_zero(map, false);
1500	spin_unlock_bh(&map_idr_lock);
1501
1502	return map;
1503}
1504EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
1505
1506int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
1507{
1508	return -ENOTSUPP;
1509}
1510
1511static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1512{
1513	if (key_size)
1514		return vmemdup_user(ukey, key_size);
1515
1516	if (ukey)
1517		return ERR_PTR(-EINVAL);
1518
1519	return NULL;
1520}
1521
1522static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1523{
1524	if (key_size)
1525		return kvmemdup_bpfptr(ukey, key_size);
1526
1527	if (!bpfptr_is_null(ukey))
1528		return ERR_PTR(-EINVAL);
1529
1530	return NULL;
1531}
1532
1533/* last field in 'union bpf_attr' used by this command */
1534#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1535
1536static int map_lookup_elem(union bpf_attr *attr)
1537{
1538	void __user *ukey = u64_to_user_ptr(attr->key);
1539	void __user *uvalue = u64_to_user_ptr(attr->value);
1540	int ufd = attr->map_fd;
1541	struct bpf_map *map;
1542	void *key, *value;
1543	u32 value_size;
1544	struct fd f;
1545	int err;
1546
1547	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1548		return -EINVAL;
1549
1550	if (attr->flags & ~BPF_F_LOCK)
1551		return -EINVAL;
1552
1553	f = fdget(ufd);
1554	map = __bpf_map_get(f);
1555	if (IS_ERR(map))
1556		return PTR_ERR(map);
1557	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1558		err = -EPERM;
1559		goto err_put;
1560	}
1561
1562	if ((attr->flags & BPF_F_LOCK) &&
1563	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1564		err = -EINVAL;
1565		goto err_put;
1566	}
1567
1568	key = __bpf_copy_key(ukey, map->key_size);
1569	if (IS_ERR(key)) {
1570		err = PTR_ERR(key);
1571		goto err_put;
1572	}
1573
1574	value_size = bpf_map_value_size(map);
1575
1576	err = -ENOMEM;
1577	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1578	if (!value)
1579		goto free_key;
1580
1581	if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1582		if (copy_from_user(value, uvalue, value_size))
1583			err = -EFAULT;
1584		else
1585			err = bpf_map_copy_value(map, key, value, attr->flags);
1586		goto free_value;
1587	}
1588
1589	err = bpf_map_copy_value(map, key, value, attr->flags);
1590	if (err)
1591		goto free_value;
1592
1593	err = -EFAULT;
1594	if (copy_to_user(uvalue, value, value_size) != 0)
1595		goto free_value;
1596
1597	err = 0;
1598
1599free_value:
1600	kvfree(value);
1601free_key:
1602	kvfree(key);
1603err_put:
1604	fdput(f);
1605	return err;
1606}
1607
1608
1609#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1610
1611static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1612{
1613	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1614	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
1615	int ufd = attr->map_fd;
1616	struct bpf_map *map;
1617	void *key, *value;
1618	u32 value_size;
1619	struct fd f;
1620	int err;
1621
1622	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1623		return -EINVAL;
1624
1625	f = fdget(ufd);
1626	map = __bpf_map_get(f);
1627	if (IS_ERR(map))
1628		return PTR_ERR(map);
1629	bpf_map_write_active_inc(map);
1630	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1631		err = -EPERM;
1632		goto err_put;
1633	}
1634
1635	if ((attr->flags & BPF_F_LOCK) &&
1636	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1637		err = -EINVAL;
1638		goto err_put;
1639	}
1640
1641	key = ___bpf_copy_key(ukey, map->key_size);
1642	if (IS_ERR(key)) {
1643		err = PTR_ERR(key);
1644		goto err_put;
1645	}
1646
1647	value_size = bpf_map_value_size(map);
1648	value = kvmemdup_bpfptr(uvalue, value_size);
1649	if (IS_ERR(value)) {
1650		err = PTR_ERR(value);
1651		goto free_key;
1652	}
1653
1654	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
1655	if (!err)
1656		maybe_wait_bpf_programs(map);
1657
1658	kvfree(value);
1659free_key:
1660	kvfree(key);
1661err_put:
1662	bpf_map_write_active_dec(map);
1663	fdput(f);
1664	return err;
1665}
1666
1667#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1668
1669static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1670{
1671	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1672	int ufd = attr->map_fd;
1673	struct bpf_map *map;
1674	struct fd f;
1675	void *key;
1676	int err;
1677
1678	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1679		return -EINVAL;
1680
1681	f = fdget(ufd);
1682	map = __bpf_map_get(f);
1683	if (IS_ERR(map))
1684		return PTR_ERR(map);
1685	bpf_map_write_active_inc(map);
1686	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1687		err = -EPERM;
1688		goto err_put;
1689	}
1690
1691	key = ___bpf_copy_key(ukey, map->key_size);
1692	if (IS_ERR(key)) {
1693		err = PTR_ERR(key);
1694		goto err_put;
1695	}
1696
1697	if (bpf_map_is_offloaded(map)) {
1698		err = bpf_map_offload_delete_elem(map, key);
1699		goto out;
1700	} else if (IS_FD_PROG_ARRAY(map) ||
1701		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1702		/* These maps require sleepable context */
1703		err = map->ops->map_delete_elem(map, key);
1704		goto out;
1705	}
1706
1707	bpf_disable_instrumentation();
1708	rcu_read_lock();
1709	err = map->ops->map_delete_elem(map, key);
1710	rcu_read_unlock();
1711	bpf_enable_instrumentation();
1712	if (!err)
1713		maybe_wait_bpf_programs(map);
1714out:
1715	kvfree(key);
1716err_put:
1717	bpf_map_write_active_dec(map);
1718	fdput(f);
1719	return err;
1720}
1721
1722/* last field in 'union bpf_attr' used by this command */
1723#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1724
1725static int map_get_next_key(union bpf_attr *attr)
1726{
1727	void __user *ukey = u64_to_user_ptr(attr->key);
1728	void __user *unext_key = u64_to_user_ptr(attr->next_key);
1729	int ufd = attr->map_fd;
1730	struct bpf_map *map;
1731	void *key, *next_key;
1732	struct fd f;
1733	int err;
1734
1735	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1736		return -EINVAL;
1737
1738	f = fdget(ufd);
1739	map = __bpf_map_get(f);
1740	if (IS_ERR(map))
1741		return PTR_ERR(map);
1742	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1743		err = -EPERM;
1744		goto err_put;
1745	}
1746
1747	if (ukey) {
1748		key = __bpf_copy_key(ukey, map->key_size);
1749		if (IS_ERR(key)) {
1750			err = PTR_ERR(key);
1751			goto err_put;
1752		}
1753	} else {
1754		key = NULL;
1755	}
1756
1757	err = -ENOMEM;
1758	next_key = kvmalloc(map->key_size, GFP_USER);
1759	if (!next_key)
1760		goto free_key;
1761
1762	if (bpf_map_is_offloaded(map)) {
1763		err = bpf_map_offload_get_next_key(map, key, next_key);
1764		goto out;
1765	}
1766
1767	rcu_read_lock();
1768	err = map->ops->map_get_next_key(map, key, next_key);
1769	rcu_read_unlock();
1770out:
1771	if (err)
1772		goto free_next_key;
1773
1774	err = -EFAULT;
1775	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1776		goto free_next_key;
1777
1778	err = 0;
1779
1780free_next_key:
1781	kvfree(next_key);
1782free_key:
1783	kvfree(key);
1784err_put:
1785	fdput(f);
1786	return err;
1787}
1788
1789int generic_map_delete_batch(struct bpf_map *map,
1790			     const union bpf_attr *attr,
1791			     union bpf_attr __user *uattr)
1792{
1793	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1794	u32 cp, max_count;
1795	int err = 0;
1796	void *key;
1797
1798	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1799		return -EINVAL;
1800
1801	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1802	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1803		return -EINVAL;
1804	}
1805
1806	max_count = attr->batch.count;
1807	if (!max_count)
1808		return 0;
1809
1810	if (put_user(0, &uattr->batch.count))
1811		return -EFAULT;
1812
1813	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1814	if (!key)
1815		return -ENOMEM;
1816
1817	for (cp = 0; cp < max_count; cp++) {
1818		err = -EFAULT;
1819		if (copy_from_user(key, keys + cp * map->key_size,
1820				   map->key_size))
1821			break;
1822
1823		if (bpf_map_is_offloaded(map)) {
1824			err = bpf_map_offload_delete_elem(map, key);
1825			break;
1826		}
1827
1828		bpf_disable_instrumentation();
1829		rcu_read_lock();
1830		err = map->ops->map_delete_elem(map, key);
1831		rcu_read_unlock();
1832		bpf_enable_instrumentation();
1833		if (err)
1834			break;
1835		cond_resched();
1836	}
1837	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1838		err = -EFAULT;
1839
1840	kvfree(key);
1841
1842	return err;
1843}
1844
1845int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
1846			     const union bpf_attr *attr,
1847			     union bpf_attr __user *uattr)
1848{
1849	void __user *values = u64_to_user_ptr(attr->batch.values);
1850	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1851	u32 value_size, cp, max_count;
1852	void *key, *value;
1853	int err = 0;
1854
1855	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1856		return -EINVAL;
1857
1858	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1859	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1860		return -EINVAL;
1861	}
1862
1863	value_size = bpf_map_value_size(map);
1864
1865	max_count = attr->batch.count;
1866	if (!max_count)
1867		return 0;
1868
1869	if (put_user(0, &uattr->batch.count))
1870		return -EFAULT;
1871
1872	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1873	if (!key)
1874		return -ENOMEM;
1875
1876	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1877	if (!value) {
1878		kvfree(key);
1879		return -ENOMEM;
1880	}
1881
1882	for (cp = 0; cp < max_count; cp++) {
1883		err = -EFAULT;
1884		if (copy_from_user(key, keys + cp * map->key_size,
1885		    map->key_size) ||
1886		    copy_from_user(value, values + cp * value_size, value_size))
1887			break;
1888
1889		err = bpf_map_update_value(map, map_file, key, value,
1890					   attr->batch.elem_flags);
1891
1892		if (err)
1893			break;
1894		cond_resched();
1895	}
1896
1897	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1898		err = -EFAULT;
1899
1900	kvfree(value);
1901	kvfree(key);
1902
1903	return err;
1904}
1905
1906#define MAP_LOOKUP_RETRIES 3
1907
1908int generic_map_lookup_batch(struct bpf_map *map,
1909				    const union bpf_attr *attr,
1910				    union bpf_attr __user *uattr)
1911{
1912	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1913	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1914	void __user *values = u64_to_user_ptr(attr->batch.values);
1915	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1916	void *buf, *buf_prevkey, *prev_key, *key, *value;
1917	int err, retry = MAP_LOOKUP_RETRIES;
1918	u32 value_size, cp, max_count;
1919
1920	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1921		return -EINVAL;
1922
1923	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1924	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
1925		return -EINVAL;
1926
1927	value_size = bpf_map_value_size(map);
1928
1929	max_count = attr->batch.count;
1930	if (!max_count)
1931		return 0;
1932
1933	if (put_user(0, &uattr->batch.count))
1934		return -EFAULT;
1935
1936	buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1937	if (!buf_prevkey)
1938		return -ENOMEM;
1939
1940	buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1941	if (!buf) {
1942		kvfree(buf_prevkey);
1943		return -ENOMEM;
1944	}
1945
1946	err = -EFAULT;
1947	prev_key = NULL;
1948	if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1949		goto free_buf;
1950	key = buf;
1951	value = key + map->key_size;
1952	if (ubatch)
1953		prev_key = buf_prevkey;
1954
1955	for (cp = 0; cp < max_count;) {
1956		rcu_read_lock();
1957		err = map->ops->map_get_next_key(map, prev_key, key);
1958		rcu_read_unlock();
1959		if (err)
1960			break;
1961		err = bpf_map_copy_value(map, key, value,
1962					 attr->batch.elem_flags);
1963
1964		if (err == -ENOENT) {
1965			if (retry) {
1966				retry--;
1967				continue;
1968			}
1969			err = -EINTR;
1970			break;
1971		}
1972
1973		if (err)
1974			goto free_buf;
1975
1976		if (copy_to_user(keys + cp * map->key_size, key,
1977				 map->key_size)) {
1978			err = -EFAULT;
1979			goto free_buf;
1980		}
1981		if (copy_to_user(values + cp * value_size, value, value_size)) {
1982			err = -EFAULT;
1983			goto free_buf;
1984		}
1985
1986		if (!prev_key)
1987			prev_key = buf_prevkey;
1988
1989		swap(prev_key, key);
1990		retry = MAP_LOOKUP_RETRIES;
1991		cp++;
1992		cond_resched();
1993	}
1994
1995	if (err == -EFAULT)
1996		goto free_buf;
1997
1998	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1999		    (cp && copy_to_user(uobatch, prev_key, map->key_size))))
2000		err = -EFAULT;
2001
2002free_buf:
2003	kvfree(buf_prevkey);
2004	kvfree(buf);
2005	return err;
2006}
2007
2008#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
2009
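/* BPF_MAP_LOOKUP_AND_DELETE_ELEM: queue and stack maps service this via
 * map_pop_elem() and reject any flags, while the hash map variants use their
 * map_lookup_and_delete_elem() callback with instrumentation disabled around
 * the RCU section. BPF_F_LOCK is only accepted if the map value carries a
 * bpf_spin_lock.
 */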
2010static int map_lookup_and_delete_elem(union bpf_attr *attr)
2011{
2012	void __user *ukey = u64_to_user_ptr(attr->key);
2013	void __user *uvalue = u64_to_user_ptr(attr->value);
2014	int ufd = attr->map_fd;
2015	struct bpf_map *map;
2016	void *key, *value;
2017	u32 value_size;
2018	struct fd f;
2019	int err;
2020
2021	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
2022		return -EINVAL;
2023
2024	if (attr->flags & ~BPF_F_LOCK)
2025		return -EINVAL;
2026
2027	f = fdget(ufd);
2028	map = __bpf_map_get(f);
2029	if (IS_ERR(map))
2030		return PTR_ERR(map);
2031	bpf_map_write_active_inc(map);
2032	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
2033	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2034		err = -EPERM;
2035		goto err_put;
2036	}
2037
2038	if (attr->flags &&
2039	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
2040	     map->map_type == BPF_MAP_TYPE_STACK)) {
2041		err = -EINVAL;
2042		goto err_put;
2043	}
2044
2045	if ((attr->flags & BPF_F_LOCK) &&
2046	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
2047		err = -EINVAL;
2048		goto err_put;
2049	}
2050
2051	key = __bpf_copy_key(ukey, map->key_size);
2052	if (IS_ERR(key)) {
2053		err = PTR_ERR(key);
2054		goto err_put;
2055	}
2056
2057	value_size = bpf_map_value_size(map);
2058
2059	err = -ENOMEM;
2060	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
2061	if (!value)
2062		goto free_key;
2063
2064	err = -ENOTSUPP;
2065	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
2066	    map->map_type == BPF_MAP_TYPE_STACK) {
2067		err = map->ops->map_pop_elem(map, value);
2068	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
2069		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2070		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
2071		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2072		if (!bpf_map_is_offloaded(map)) {
2073			bpf_disable_instrumentation();
2074			rcu_read_lock();
2075			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
2076			rcu_read_unlock();
2077			bpf_enable_instrumentation();
2078		}
2079	}
2080
2081	if (err)
2082		goto free_value;
2083
2084	if (copy_to_user(uvalue, value, value_size) != 0) {
2085		err = -EFAULT;
2086		goto free_value;
2087	}
2088
2089	err = 0;
2090
2091free_value:
2092	kvfree(value);
2093free_key:
2094	kvfree(key);
2095err_put:
2096	bpf_map_write_active_dec(map);
2097	fdput(f);
2098	return err;
2099}
2100
2101#define BPF_MAP_FREEZE_LAST_FIELD map_fd
2102
2103static int map_freeze(const union bpf_attr *attr)
2104{
2105	int err = 0, ufd = attr->map_fd;
2106	struct bpf_map *map;
2107	struct fd f;
2108
2109	if (CHECK_ATTR(BPF_MAP_FREEZE))
2110		return -EINVAL;
2111
2112	f = fdget(ufd);
2113	map = __bpf_map_get(f);
2114	if (IS_ERR(map))
2115		return PTR_ERR(map);
2116
2117	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
2118		fdput(f);
2119		return -ENOTSUPP;
2120	}
2121
2122	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2123		fdput(f);
2124		return -EPERM;
2125	}
2126
2127	mutex_lock(&map->freeze_mutex);
2128	if (bpf_map_write_active(map)) {
2129		err = -EBUSY;
2130		goto err_put;
2131	}
2132	if (READ_ONCE(map->frozen)) {
2133		err = -EBUSY;
2134		goto err_put;
2135	}
2136
2137	WRITE_ONCE(map->frozen, true);
2138err_put:
2139	mutex_unlock(&map->freeze_mutex);
2140	fdput(f);
2141	return err;
2142}
2143
2144static const struct bpf_prog_ops * const bpf_prog_types[] = {
2145#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
2146	[_id] = & _name ## _prog_ops,
2147#define BPF_MAP_TYPE(_id, _ops)
2148#define BPF_LINK_TYPE(_id, _name)
2149#include <linux/bpf_types.h>
2150#undef BPF_PROG_TYPE
2151#undef BPF_MAP_TYPE
2152#undef BPF_LINK_TYPE
2153};
2154
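/* Resolve the user-supplied prog type to its bpf_prog_ops, clamping the index
 * with array_index_nospec() to avoid speculation past the table bounds.
 * Offloaded programs are always routed through bpf_offload_prog_ops instead.
 */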
2155static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
2156{
2157	const struct bpf_prog_ops *ops;
2158
2159	if (type >= ARRAY_SIZE(bpf_prog_types))
2160		return -EINVAL;
2161	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
2162	ops = bpf_prog_types[type];
2163	if (!ops)
2164		return -EINVAL;
2165
2166	if (!bpf_prog_is_offloaded(prog->aux))
2167		prog->aux->ops = ops;
2168	else
2169		prog->aux->ops = &bpf_offload_prog_ops;
2170	prog->type = type;
2171	return 0;
2172}
2173
2174enum bpf_audit {
2175	BPF_AUDIT_LOAD,
2176	BPF_AUDIT_UNLOAD,
2177	BPF_AUDIT_MAX,
2178};
2179
2180static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
2181	[BPF_AUDIT_LOAD]   = "LOAD",
2182	[BPF_AUDIT_UNLOAD] = "UNLOAD",
2183};
2184
2185static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
2186{
2187	struct audit_context *ctx = NULL;
2188	struct audit_buffer *ab;
2189
2190	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
2191		return;
2192	if (audit_enabled == AUDIT_OFF)
2193		return;
2194	if (!in_irq() && !irqs_disabled())
2195		ctx = audit_context();
2196	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
2197	if (unlikely(!ab))
2198		return;
2199	audit_log_format(ab, "prog-id=%u op=%s",
2200			 prog->aux->id, bpf_audit_str[op]);
2201	audit_log_end(ab);
2202}
2203
2204static int bpf_prog_alloc_id(struct bpf_prog *prog)
2205{
2206	int id;
2207
2208	idr_preload(GFP_KERNEL);
2209	spin_lock_bh(&prog_idr_lock);
2210	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
2211	if (id > 0)
2212		prog->aux->id = id;
2213	spin_unlock_bh(&prog_idr_lock);
2214	idr_preload_end();
2215
2216	/* id is in [1, INT_MAX) */
2217	if (WARN_ON_ONCE(!id))
2218		return -ENOSPC;
2219
2220	return id > 0 ? 0 : id;
2221}
2222
2223void bpf_prog_free_id(struct bpf_prog *prog)
2224{
2225	unsigned long flags;
2226
2227	/* cBPF to eBPF migrations are currently not in the idr store.
2228	 * Offloaded programs are removed from the store when their device
2229	 * disappears - even if someone grabs an fd to them they are unusable,
2230	 * simply waiting for refcnt to drop to be freed.
2231	 */
2232	if (!prog->aux->id)
2233		return;
2234
2235	spin_lock_irqsave(&prog_idr_lock, flags);
2236	idr_remove(&prog_idr, prog->aux->id);
2237	prog->aux->id = 0;
2238	spin_unlock_irqrestore(&prog_idr_lock, flags);
2239}
2240
2241static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2242{
2243	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
2244
2245	kvfree(aux->func_info);
2246	kfree(aux->func_info_aux);
2247	free_uid(aux->user);
2248	security_bpf_prog_free(aux->prog);
2249	bpf_prog_free(aux->prog);
2250}
2251
2252static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2253{
2254	bpf_prog_kallsyms_del_all(prog);
2255	btf_put(prog->aux->btf);
2256	module_put(prog->aux->mod);
2257	kvfree(prog->aux->jited_linfo);
2258	kvfree(prog->aux->linfo);
2259	kfree(prog->aux->kfunc_tab);
2260	if (prog->aux->attach_btf)
2261		btf_put(prog->aux->attach_btf);
2262
2263	if (deferred) {
2264		if (prog->sleepable)
2265			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
2266		else
2267			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
2268	} else {
2269		__bpf_prog_put_rcu(&prog->aux->rcu);
2270	}
2271}
2272
2273static void bpf_prog_put_deferred(struct work_struct *work)
2274{
2275	struct bpf_prog_aux *aux;
2276	struct bpf_prog *prog;
2277
2278	aux = container_of(work, struct bpf_prog_aux, work);
2279	prog = aux->prog;
2280	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2281	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
2282	bpf_prog_free_id(prog);
2283	__bpf_prog_put_noref(prog, true);
2284}
2285
2286static void __bpf_prog_put(struct bpf_prog *prog)
2287{
2288	struct bpf_prog_aux *aux = prog->aux;
2289
2290	if (atomic64_dec_and_test(&aux->refcnt)) {
2291		if (in_irq() || irqs_disabled()) {
2292			INIT_WORK(&aux->work, bpf_prog_put_deferred);
2293			schedule_work(&aux->work);
2294		} else {
2295			bpf_prog_put_deferred(&aux->work);
2296		}
2297	}
2298}
2299
2300void bpf_prog_put(struct bpf_prog *prog)
2301{
2302	__bpf_prog_put(prog);
2303}
2304EXPORT_SYMBOL_GPL(bpf_prog_put);
2305
2306static int bpf_prog_release(struct inode *inode, struct file *filp)
2307{
2308	struct bpf_prog *prog = filp->private_data;
2309
2310	bpf_prog_put(prog);
2311	return 0;
2312}
2313
2314struct bpf_prog_kstats {
2315	u64 nsecs;
2316	u64 cnt;
2317	u64 misses;
2318};
2319
2320void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2321{
2322	struct bpf_prog_stats *stats;
2323	unsigned int flags;
2324
2325	stats = this_cpu_ptr(prog->stats);
2326	flags = u64_stats_update_begin_irqsave(&stats->syncp);
2327	u64_stats_inc(&stats->misses);
2328	u64_stats_update_end_irqrestore(&stats->syncp, flags);
2329}
2330
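/* Sum the per-CPU run-time statistics. Each CPU's counters are sampled under
 * its u64_stats sequence counter so the 64-bit values are read consistently
 * even on 32-bit kernels.
 */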
2331static void bpf_prog_get_stats(const struct bpf_prog *prog,
2332			       struct bpf_prog_kstats *stats)
2333{
2334	u64 nsecs = 0, cnt = 0, misses = 0;
2335	int cpu;
2336
2337	for_each_possible_cpu(cpu) {
2338		const struct bpf_prog_stats *st;
2339		unsigned int start;
2340		u64 tnsecs, tcnt, tmisses;
2341
2342		st = per_cpu_ptr(prog->stats, cpu);
2343		do {
2344			start = u64_stats_fetch_begin(&st->syncp);
2345			tnsecs = u64_stats_read(&st->nsecs);
2346			tcnt = u64_stats_read(&st->cnt);
2347			tmisses = u64_stats_read(&st->misses);
2348		} while (u64_stats_fetch_retry(&st->syncp, start));
2349		nsecs += tnsecs;
2350		cnt += tcnt;
2351		misses += tmisses;
2352	}
2353	stats->nsecs = nsecs;
2354	stats->cnt = cnt;
2355	stats->misses = misses;
2356}
2357
2358#ifdef CONFIG_PROC_FS
2359static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
2360{
2361	const struct bpf_prog *prog = filp->private_data;
2362	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2363	struct bpf_prog_kstats stats;
2364
2365	bpf_prog_get_stats(prog, &stats);
2366	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2367	seq_printf(m,
2368		   "prog_type:\t%u\n"
2369		   "prog_jited:\t%u\n"
2370		   "prog_tag:\t%s\n"
2371		   "memlock:\t%llu\n"
2372		   "prog_id:\t%u\n"
2373		   "run_time_ns:\t%llu\n"
2374		   "run_cnt:\t%llu\n"
2375		   "recursion_misses:\t%llu\n"
2376		   "verified_insns:\t%u\n",
2377		   prog->type,
2378		   prog->jited,
2379		   prog_tag,
2380		   prog->pages * 1ULL << PAGE_SHIFT,
2381		   prog->aux->id,
2382		   stats.nsecs,
2383		   stats.cnt,
2384		   stats.misses,
2385		   prog->aux->verified_insns);
2386}
2387#endif
2388
2389const struct file_operations bpf_prog_fops = {
2390#ifdef CONFIG_PROC_FS
2391	.show_fdinfo	= bpf_prog_show_fdinfo,
2392#endif
2393	.release	= bpf_prog_release,
2394	.read		= bpf_dummy_read,
2395	.write		= bpf_dummy_write,
2396};
2397
2398int bpf_prog_new_fd(struct bpf_prog *prog)
2399{
2400	int ret;
2401
2402	ret = security_bpf_prog(prog);
2403	if (ret < 0)
2404		return ret;
2405
2406	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
2407				O_RDWR | O_CLOEXEC);
2408}
2409
2410static struct bpf_prog *____bpf_prog_get(struct fd f)
2411{
2412	if (!f.file)
2413		return ERR_PTR(-EBADF);
2414	if (f.file->f_op != &bpf_prog_fops) {
2415		fdput(f);
2416		return ERR_PTR(-EINVAL);
2417	}
2418
2419	return f.file->private_data;
2420}
2421
2422void bpf_prog_add(struct bpf_prog *prog, int i)
2423{
2424	atomic64_add(i, &prog->aux->refcnt);
2425}
2426EXPORT_SYMBOL_GPL(bpf_prog_add);
2427
2428void bpf_prog_sub(struct bpf_prog *prog, int i)
2429{
2430	/* Only to be used for undoing previous bpf_prog_add() in some
2431	 * error path. We still know that another entity in our call
2432	 * path holds a reference to the program, thus atomic_sub() can
2433	 * be safely used in such cases!
2434	 */
2435	WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2436}
2437EXPORT_SYMBOL_GPL(bpf_prog_sub);
2438
2439void bpf_prog_inc(struct bpf_prog *prog)
2440{
2441	atomic64_inc(&prog->aux->refcnt);
2442}
2443EXPORT_SYMBOL_GPL(bpf_prog_inc);
2444
2445/* prog_idr_lock must be held by the caller */
2446struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2447{
2448	int refold;
2449
2450	refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
2451
2452	if (!refold)
2453		return ERR_PTR(-ENOENT);
2454
2455	return prog;
2456}
2457EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2458
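/* Decide whether a program looked up by fd may be used in the requested
 * context: a NULL attach_type means the caller only wants a reference, a
 * mismatched type is rejected, and an offloaded program is only allowed when
 * the caller explicitly accepts device-bound programs (attach_drv).
 */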
2459bool bpf_prog_get_ok(struct bpf_prog *prog,
2460			    enum bpf_prog_type *attach_type, bool attach_drv)
2461{
2462	/* not an attachment, just a refcount inc, always allow */
2463	if (!attach_type)
2464		return true;
2465
2466	if (prog->type != *attach_type)
2467		return false;
2468	if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
2469		return false;
2470
2471	return true;
2472}
2473
2474static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2475				       bool attach_drv)
2476{
2477	struct fd f = fdget(ufd);
2478	struct bpf_prog *prog;
2479
2480	prog = ____bpf_prog_get(f);
2481	if (IS_ERR(prog))
2482		return prog;
2483	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2484		prog = ERR_PTR(-EINVAL);
2485		goto out;
2486	}
2487
2488	bpf_prog_inc(prog);
2489out:
2490	fdput(f);
2491	return prog;
2492}
2493
2494struct bpf_prog *bpf_prog_get(u32 ufd)
2495{
2496	return __bpf_prog_get(ufd, NULL, false);
2497}
2498
2499struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2500				       bool attach_drv)
2501{
2502	return __bpf_prog_get(ufd, &type, attach_drv);
2503}
2504EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2505
2506/* Initially all BPF programs could be loaded w/o specifying
2507 * expected_attach_type. Later for some of them specifying expected_attach_type
2508 * at load time became required so that program could be validated properly.
2509 * Programs of types that are allowed to be loaded both w/ and w/o (for
2510 * backward compatibility) expected_attach_type should have the default attach
2511 * type assigned to expected_attach_type for the latter case, so that it can be
2512 * validated later at attach time.
2513 *
2514 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2515 * prog type requires it but has some attach types that have to be backward
2516 * compatible.
2517 */
2518static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2519{
2520	switch (attr->prog_type) {
2521	case BPF_PROG_TYPE_CGROUP_SOCK:
2522		/* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2523		 * exist so checking for non-zero is the way to go here.
2524		 */
2525		if (!attr->expected_attach_type)
2526			attr->expected_attach_type =
2527				BPF_CGROUP_INET_SOCK_CREATE;
2528		break;
2529	case BPF_PROG_TYPE_SK_REUSEPORT:
2530		if (!attr->expected_attach_type)
2531			attr->expected_attach_type =
2532				BPF_SK_REUSEPORT_SELECT;
2533		break;
2534	}
2535}
2536
2537static int
2538bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2539			   enum bpf_attach_type expected_attach_type,
2540			   struct btf *attach_btf, u32 btf_id,
2541			   struct bpf_prog *dst_prog)
2542{
2543	if (btf_id) {
2544		if (btf_id > BTF_MAX_TYPE)
2545			return -EINVAL;
2546
2547		if (!attach_btf && !dst_prog)
2548			return -EINVAL;
2549
2550		switch (prog_type) {
2551		case BPF_PROG_TYPE_TRACING:
2552		case BPF_PROG_TYPE_LSM:
2553		case BPF_PROG_TYPE_STRUCT_OPS:
2554		case BPF_PROG_TYPE_EXT:
2555			break;
2556		default:
2557			return -EINVAL;
2558		}
2559	}
2560
2561	if (attach_btf && (!btf_id || dst_prog))
2562		return -EINVAL;
2563
2564	if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2565	    prog_type != BPF_PROG_TYPE_EXT)
2566		return -EINVAL;
2567
2568	switch (prog_type) {
2569	case BPF_PROG_TYPE_CGROUP_SOCK:
2570		switch (expected_attach_type) {
2571		case BPF_CGROUP_INET_SOCK_CREATE:
2572		case BPF_CGROUP_INET_SOCK_RELEASE:
2573		case BPF_CGROUP_INET4_POST_BIND:
2574		case BPF_CGROUP_INET6_POST_BIND:
2575			return 0;
2576		default:
2577			return -EINVAL;
2578		}
2579	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2580		switch (expected_attach_type) {
2581		case BPF_CGROUP_INET4_BIND:
2582		case BPF_CGROUP_INET6_BIND:
2583		case BPF_CGROUP_INET4_CONNECT:
2584		case BPF_CGROUP_INET6_CONNECT:
2585		case BPF_CGROUP_UNIX_CONNECT:
2586		case BPF_CGROUP_INET4_GETPEERNAME:
2587		case BPF_CGROUP_INET6_GETPEERNAME:
2588		case BPF_CGROUP_UNIX_GETPEERNAME:
2589		case BPF_CGROUP_INET4_GETSOCKNAME:
2590		case BPF_CGROUP_INET6_GETSOCKNAME:
2591		case BPF_CGROUP_UNIX_GETSOCKNAME:
2592		case BPF_CGROUP_UDP4_SENDMSG:
2593		case BPF_CGROUP_UDP6_SENDMSG:
2594		case BPF_CGROUP_UNIX_SENDMSG:
2595		case BPF_CGROUP_UDP4_RECVMSG:
2596		case BPF_CGROUP_UDP6_RECVMSG:
2597		case BPF_CGROUP_UNIX_RECVMSG:
2598			return 0;
2599		default:
2600			return -EINVAL;
2601		}
2602	case BPF_PROG_TYPE_CGROUP_SKB:
2603		switch (expected_attach_type) {
2604		case BPF_CGROUP_INET_INGRESS:
2605		case BPF_CGROUP_INET_EGRESS:
2606			return 0;
2607		default:
2608			return -EINVAL;
2609		}
2610	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2611		switch (expected_attach_type) {
2612		case BPF_CGROUP_SETSOCKOPT:
2613		case BPF_CGROUP_GETSOCKOPT:
2614			return 0;
2615		default:
2616			return -EINVAL;
2617		}
2618	case BPF_PROG_TYPE_SK_LOOKUP:
2619		if (expected_attach_type == BPF_SK_LOOKUP)
2620			return 0;
2621		return -EINVAL;
2622	case BPF_PROG_TYPE_SK_REUSEPORT:
2623		switch (expected_attach_type) {
2624		case BPF_SK_REUSEPORT_SELECT:
2625		case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2626			return 0;
2627		default:
2628			return -EINVAL;
2629		}
2630	case BPF_PROG_TYPE_NETFILTER:
2631		if (expected_attach_type == BPF_NETFILTER)
2632			return 0;
2633		return -EINVAL;
2634	case BPF_PROG_TYPE_SYSCALL:
2635	case BPF_PROG_TYPE_EXT:
2636		if (expected_attach_type)
2637			return -EINVAL;
2638		fallthrough;
2639	default:
2640		return 0;
2641	}
2642}
2643
2644static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2645{
2646	switch (prog_type) {
2647	case BPF_PROG_TYPE_SCHED_CLS:
2648	case BPF_PROG_TYPE_SCHED_ACT:
2649	case BPF_PROG_TYPE_XDP:
2650	case BPF_PROG_TYPE_LWT_IN:
2651	case BPF_PROG_TYPE_LWT_OUT:
2652	case BPF_PROG_TYPE_LWT_XMIT:
2653	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2654	case BPF_PROG_TYPE_SK_SKB:
2655	case BPF_PROG_TYPE_SK_MSG:
2656	case BPF_PROG_TYPE_FLOW_DISSECTOR:
2657	case BPF_PROG_TYPE_CGROUP_DEVICE:
2658	case BPF_PROG_TYPE_CGROUP_SOCK:
2659	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2660	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2661	case BPF_PROG_TYPE_CGROUP_SYSCTL:
2662	case BPF_PROG_TYPE_SOCK_OPS:
2663	case BPF_PROG_TYPE_EXT: /* extends any prog */
2664	case BPF_PROG_TYPE_NETFILTER:
2665		return true;
2666	case BPF_PROG_TYPE_CGROUP_SKB:
2667		/* always unpriv */
2668	case BPF_PROG_TYPE_SK_REUSEPORT:
2669		/* equivalent to SOCKET_FILTER. need CAP_BPF only */
2670	default:
2671		return false;
2672	}
2673}
2674
2675static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2676{
2677	switch (prog_type) {
2678	case BPF_PROG_TYPE_KPROBE:
2679	case BPF_PROG_TYPE_TRACEPOINT:
2680	case BPF_PROG_TYPE_PERF_EVENT:
2681	case BPF_PROG_TYPE_RAW_TRACEPOINT:
2682	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2683	case BPF_PROG_TYPE_TRACING:
2684	case BPF_PROG_TYPE_LSM:
2685	case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2686	case BPF_PROG_TYPE_EXT: /* extends any prog */
2687		return true;
2688	default:
2689		return false;
2690	}
2691}
2692
2693/* last field in 'union bpf_attr' used by this command */
2694#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
2695
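/* BPF_PROG_LOAD: after the token/capability checks below, the attach target
 * (prog fd or BTF id) is resolved, the program is allocated and its
 * instructions and license are copied in from user space, the verifier
 * (bpf_check()) runs and a runtime is selected (bpf_prog_select_runtime()),
 * and only then is an ID allocated and an fd installed. Any failure after ID
 * allocation must go through bpf_prog_put(), see the comment further down.
 */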
2696static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
2697{
2698	enum bpf_prog_type type = attr->prog_type;
2699	struct bpf_prog *prog, *dst_prog = NULL;
2700	struct btf *attach_btf = NULL;
2701	struct bpf_token *token = NULL;
2702	bool bpf_cap;
2703	int err;
2704	char license[128];
2705
2706	if (CHECK_ATTR(BPF_PROG_LOAD))
2707		return -EINVAL;
2708
2709	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2710				 BPF_F_ANY_ALIGNMENT |
2711				 BPF_F_TEST_STATE_FREQ |
2712				 BPF_F_SLEEPABLE |
2713				 BPF_F_TEST_RND_HI32 |
2714				 BPF_F_XDP_HAS_FRAGS |
2715				 BPF_F_XDP_DEV_BOUND_ONLY |
2716				 BPF_F_TEST_REG_INVARIANTS |
2717				 BPF_F_TOKEN_FD))
2718		return -EINVAL;
2719
2720	bpf_prog_load_fixup_attach_type(attr);
2721
2722	if (attr->prog_flags & BPF_F_TOKEN_FD) {
2723		token = bpf_token_get_from_fd(attr->prog_token_fd);
2724		if (IS_ERR(token))
2725			return PTR_ERR(token);
2726		/* if current token doesn't grant prog loading permissions,
2727		 * then we can't use this token, so ignore it and rely on
2728		 * system-wide capabilities checks
2729		 */
2730		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
2731		    !bpf_token_allow_prog_type(token, attr->prog_type,
2732					       attr->expected_attach_type)) {
2733			bpf_token_put(token);
2734			token = NULL;
2735		}
2736	}
2737
2738	bpf_cap = bpf_token_capable(token, CAP_BPF);
2739	err = -EPERM;
2740
2741	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2742	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2743	    !bpf_cap)
2744		goto put_token;
2745
2746	/* Intent here is for unprivileged_bpf_disabled to block BPF program
2747	 * creation for unprivileged users; other actions depend on fd
2748	 * availability and access to bpffs, and so are gated by object
2749	 * creation success. Even with unprivileged BPF disabled,
2750	 * capability checks are still carried out for these
2751	 * and other operations.
2752	 */
2753	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
2754		goto put_token;
2755
2756	if (attr->insn_cnt == 0 ||
2757	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
2758		err = -E2BIG;
2759		goto put_token;
2760	}
2761	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2762	    type != BPF_PROG_TYPE_CGROUP_SKB &&
2763	    !bpf_cap)
2764		goto put_token;
2765
2766	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
2767		goto put_token;
2768	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
2769		goto put_token;
2770
2771	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
2772	 * or btf; we need to check which one it is
2773	 */
2774	if (attr->attach_prog_fd) {
2775		dst_prog = bpf_prog_get(attr->attach_prog_fd);
2776		if (IS_ERR(dst_prog)) {
2777			dst_prog = NULL;
2778			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
2779			if (IS_ERR(attach_btf)) {
2780				err = -EINVAL;
2781				goto put_token;
2782			}
2783			if (!btf_is_kernel(attach_btf)) {
2784				/* attaching through specifying bpf_prog's BTF
2785				 * objects directly might be supported eventually
2786				 */
2787				btf_put(attach_btf);
2788				err = -ENOTSUPP;
2789				goto put_token;
2790			}
2791		}
2792	} else if (attr->attach_btf_id) {
2793		/* fall back to vmlinux BTF, if BTF type ID is specified */
2794		attach_btf = bpf_get_btf_vmlinux();
2795		if (IS_ERR(attach_btf)) {
2796			err = PTR_ERR(attach_btf);
2797			goto put_token;
2798		}
2799		if (!attach_btf) {
2800			err = -EINVAL;
2801			goto put_token;
2802		}
2803		btf_get(attach_btf);
2804	}
2805
2806	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2807				       attach_btf, attr->attach_btf_id,
2808				       dst_prog)) {
2809		if (dst_prog)
2810			bpf_prog_put(dst_prog);
2811		if (attach_btf)
2812			btf_put(attach_btf);
2813		err = -EINVAL;
2814		goto put_token;
2815	}
2816
2817	/* plain bpf_prog allocation */
2818	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2819	if (!prog) {
2820		if (dst_prog)
2821			bpf_prog_put(dst_prog);
2822		if (attach_btf)
2823			btf_put(attach_btf);
2824		err = -ENOMEM;
2825		goto put_token;
2826	}
2827
2828	prog->expected_attach_type = attr->expected_attach_type;
2829	prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
2830	prog->aux->attach_btf = attach_btf;
2831	prog->aux->attach_btf_id = attr->attach_btf_id;
2832	prog->aux->dst_prog = dst_prog;
2833	prog->aux->dev_bound = !!attr->prog_ifindex;
2834	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2835
2836	/* move token into prog->aux, reuse taken refcnt */
2837	prog->aux->token = token;
2838	token = NULL;
2839
2840	prog->aux->user = get_current_user();
2841	prog->len = attr->insn_cnt;
2842
2843	err = -EFAULT;
2844	if (copy_from_bpfptr(prog->insns,
2845			     make_bpfptr(attr->insns, uattr.is_kernel),
2846			     bpf_prog_insn_size(prog)) != 0)
2847		goto free_prog;
2848	/* copy eBPF program license from user space */
2849	if (strncpy_from_bpfptr(license,
2850				make_bpfptr(attr->license, uattr.is_kernel),
2851				sizeof(license) - 1) < 0)
2852		goto free_prog;
2853	license[sizeof(license) - 1] = 0;
2854
2855	/* eBPF programs must be GPL compatible to use GPL-ed functions */
2856	prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
2857
2858	prog->orig_prog = NULL;
2859	prog->jited = 0;
2860
2861	atomic64_set(&prog->aux->refcnt, 1);
2862
2863	if (bpf_prog_is_dev_bound(prog->aux)) {
2864		err = bpf_prog_dev_bound_init(prog, attr);
2865		if (err)
2866			goto free_prog;
2867	}
2868
2869	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
2870	    bpf_prog_is_dev_bound(dst_prog->aux)) {
2871		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
2872		if (err)
2873			goto free_prog;
2874	}
2875
2876	/*
2877	 * Bookkeeping for managing the program attachment chain.
2878	 *
2879	 * It might be tempting to set the attach_tracing_prog flag at attachment
2880	 * time, but that would not prevent loading a bunch of tracing programs
2881	 * first and then attaching them to one another.
2882	 *
2883	 * The flag attach_tracing_prog is set for the whole program lifecycle, and
2884	 * doesn't have to be cleared in bpf_tracing_link_release, since tracing
2885	 * programs cannot change attachment target.
2886	 */
2887	if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
2888	    dst_prog->type == BPF_PROG_TYPE_TRACING) {
2889		prog->aux->attach_tracing_prog = true;
2890	}
2891
2892	/* find program type: socket_filter vs tracing_filter */
2893	err = find_prog_type(type, prog);
2894	if (err < 0)
2895		goto free_prog;
2896
2897	prog->aux->load_time = ktime_get_boottime_ns();
2898	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2899			       sizeof(attr->prog_name));
2900	if (err < 0)
2901		goto free_prog;
2902
2903	err = security_bpf_prog_load(prog, attr, token);
2904	if (err)
2905		goto free_prog_sec;
2906
2907	/* run eBPF verifier */
2908	err = bpf_check(&prog, attr, uattr, uattr_size);
2909	if (err < 0)
2910		goto free_used_maps;
2911
2912	prog = bpf_prog_select_runtime(prog, &err);
2913	if (err < 0)
2914		goto free_used_maps;
2915
2916	err = bpf_prog_alloc_id(prog);
2917	if (err)
2918		goto free_used_maps;
2919
2920	/* Upon success of bpf_prog_alloc_id(), the BPF prog is
2921	 * effectively publicly exposed. However, retrieving via
2922	 * bpf_prog_get_fd_by_id() will take another reference,
2923	 * therefore it cannot disappear from underneath us.
2924	 *
2925	 * Only for the time /after/ successful bpf_prog_new_fd()
2926	 * and before returning to userspace, we might just hold
2927	 * one reference and any parallel close on that fd could
2928	 * rip everything out. Hence, below notifications must
2929	 * happen before bpf_prog_new_fd().
2930	 *
2931	 * Also, any failure handling from this point onwards must
2932	 * be using bpf_prog_put() given the program is exposed.
2933	 */
2934	bpf_prog_kallsyms_add(prog);
2935	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2936	bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2937
2938	err = bpf_prog_new_fd(prog);
2939	if (err < 0)
2940		bpf_prog_put(prog);
2941	return err;
2942
2943free_used_maps:
2944	/* In case we have subprogs, we need to wait for a grace
2945	 * period before we can tear down JIT memory since symbols
2946	 * are already exposed under kallsyms.
2947	 */
2948	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
2949	return err;
2950
2951free_prog_sec:
2952	security_bpf_prog_free(prog);
2953free_prog:
2954	free_uid(prog->aux->user);
2955	if (prog->aux->attach_btf)
2956		btf_put(prog->aux->attach_btf);
2957	bpf_prog_free(prog);
2958put_token:
2959	bpf_token_put(token);
2960	return err;
2961}
2962
2963#define BPF_OBJ_LAST_FIELD path_fd
2964
2965static int bpf_obj_pin(const union bpf_attr *attr)
2966{
2967	int path_fd;
2968
2969	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
2970		return -EINVAL;
2971
2972	/* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2973	if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2974		return -EINVAL;
2975
2976	path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2977	return bpf_obj_pin_user(attr->bpf_fd, path_fd,
2978				u64_to_user_ptr(attr->pathname));
2979}
2980
2981static int bpf_obj_get(const union bpf_attr *attr)
2982{
2983	int path_fd;
2984
2985	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2986	    attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
2987		return -EINVAL;
2988
2989	/* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2990	if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2991		return -EINVAL;
2992
2993	path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2994	return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2995				attr->file_flags);
2996}
2997
2998void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2999		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
3000{
3001	atomic64_set(&link->refcnt, 1);
3002	link->type = type;
3003	link->id = 0;
3004	link->ops = ops;
3005	link->prog = prog;
3006}
3007
3008static void bpf_link_free_id(int id)
3009{
3010	if (!id)
3011		return;
3012
3013	spin_lock_bh(&link_idr_lock);
3014	idr_remove(&link_idr, id);
3015	spin_unlock_bh(&link_idr_lock);
3016}
3017
3018/* Clean up bpf_link and corresponding anon_inode file and FD. After
3019 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
3020 * anon_inode's release() call. This helper marks bpf_link as
3021 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
3022 * is not decremented; it's the responsibility of the calling code that failed
3023 * to complete bpf_link initialization.
3024 * This helper eventually calls link's dealloc callback, but does not call
3025 * link's release callback.
3026 */
3027void bpf_link_cleanup(struct bpf_link_primer *primer)
3028{
3029	primer->link->prog = NULL;
3030	bpf_link_free_id(primer->id);
3031	fput(primer->file);
3032	put_unused_fd(primer->fd);
3033}
3034
3035void bpf_link_inc(struct bpf_link *link)
3036{
3037	atomic64_inc(&link->refcnt);
3038}
3039
3040static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
3041{
3042	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
3043
3044	/* free bpf_link and its containing memory */
3045	link->ops->dealloc_deferred(link);
3046}
3047
3048static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
3049{
3050	if (rcu_trace_implies_rcu_gp())
3051		bpf_link_defer_dealloc_rcu_gp(rcu);
3052	else
3053		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
3054}
3055
3056/* bpf_link_free is guaranteed to be called from process context */
3057static void bpf_link_free(struct bpf_link *link)
3058{
3059	bool sleepable = false;
3060
3061	bpf_link_free_id(link->id);
3062	if (link->prog) {
3063		sleepable = link->prog->sleepable;
3064		/* detach BPF program, clean up used resources */
3065		link->ops->release(link);
3066		bpf_prog_put(link->prog);
3067	}
3068	if (link->ops->dealloc_deferred) {
3069		/* schedule BPF link deallocation; if underlying BPF program
3070		 * is sleepable, we need to first wait for RCU tasks trace
3071		 * sync, then go through "classic" RCU grace period
3072		 */
3073		if (sleepable)
3074			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
3075		else
3076			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
3077	}
3078	if (link->ops->dealloc)
3079		link->ops->dealloc(link);
3080}
3081
3082static void bpf_link_put_deferred(struct work_struct *work)
3083{
3084	struct bpf_link *link = container_of(work, struct bpf_link, work);
3085
3086	bpf_link_free(link);
3087}
3088
3089/* bpf_link_put() might be called from atomic context, but freeing the link may
3090 * need to acquire sleeping locks, so the actual free is deferred to a workqueue.
3091 */
3092void bpf_link_put(struct bpf_link *link)
3093{
3094	if (!atomic64_dec_and_test(&link->refcnt))
3095		return;
3096
3097	INIT_WORK(&link->work, bpf_link_put_deferred);
3098	schedule_work(&link->work);
3099}
3100EXPORT_SYMBOL(bpf_link_put);
3101
3102static void bpf_link_put_direct(struct bpf_link *link)
3103{
3104	if (!atomic64_dec_and_test(&link->refcnt))
3105		return;
3106	bpf_link_free(link);
3107}
3108
3109static int bpf_link_release(struct inode *inode, struct file *filp)
3110{
3111	struct bpf_link *link = filp->private_data;
3112
3113	bpf_link_put_direct(link);
3114	return 0;
3115}
3116
3117#ifdef CONFIG_PROC_FS
3118#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
3119#define BPF_MAP_TYPE(_id, _ops)
3120#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
3121static const char *bpf_link_type_strs[] = {
3122	[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
3123#include <linux/bpf_types.h>
3124};
3125#undef BPF_PROG_TYPE
3126#undef BPF_MAP_TYPE
3127#undef BPF_LINK_TYPE
3128
3129static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
3130{
3131	const struct bpf_link *link = filp->private_data;
3132	const struct bpf_prog *prog = link->prog;
3133	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
3134
3135	seq_printf(m,
3136		   "link_type:\t%s\n"
3137		   "link_id:\t%u\n",
3138		   bpf_link_type_strs[link->type],
3139		   link->id);
3140	if (prog) {
3141		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
3142		seq_printf(m,
3143			   "prog_tag:\t%s\n"
3144			   "prog_id:\t%u\n",
3145			   prog_tag,
3146			   prog->aux->id);
3147	}
3148	if (link->ops->show_fdinfo)
3149		link->ops->show_fdinfo(link, m);
3150}
3151#endif
3152
3153static const struct file_operations bpf_link_fops = {
3154#ifdef CONFIG_PROC_FS
3155	.show_fdinfo	= bpf_link_show_fdinfo,
3156#endif
3157	.release	= bpf_link_release,
3158	.read		= bpf_dummy_read,
3159	.write		= bpf_dummy_write,
3160};
3161
3162static int bpf_link_alloc_id(struct bpf_link *link)
3163{
3164	int id;
3165
3166	idr_preload(GFP_KERNEL);
3167	spin_lock_bh(&link_idr_lock);
3168	id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
3169	spin_unlock_bh(&link_idr_lock);
3170	idr_preload_end();
3171
3172	return id;
3173}
3174
3175/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
3176 * reserving unused FD and allocating ID from link_idr. This is to be paired
3177 * with bpf_link_settle() to install FD and ID and expose bpf_link to
3178 * user-space, if bpf_link is successfully attached. If not, bpf_link and
3179 * pre-allocated resources are to be freed with a bpf_link_cleanup() call. All
3180 * the transient state is passed around in struct bpf_link_primer.
3181 * This is the preferred way to create and initialize bpf_link, especially when
3182 * there are complicated and expensive operations in between creating bpf_link
3183 * itself and attaching it to BPF hook. By using bpf_link_prime() and
3184 * bpf_link_settle() kernel code using bpf_link doesn't have to perform
3185 * expensive (and potentially failing) rollback operations in the rare case
3186 * that file, FD, or ID can't be allocated.
3187 */
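/* A typical attach path then looks like this (illustrative sketch only,
 * modeled on bpf_raw_tp_link_attach() and similar code further down in this
 * file; the "attach" step is whatever hook-specific registration applies):
 *
 *	err = bpf_link_prime(&link->link, &link_primer);
 *	if (err) {
 *		kfree(link);
 *		return err;
 *	}
 *	err = <attach link to its BPF hook>;
 *	if (err) {
 *		bpf_link_cleanup(&link_primer);
 *		return err;
 *	}
 *	return bpf_link_settle(&link_primer);
 */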
3188int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
3189{
3190	struct file *file;
3191	int fd, id;
3192
3193	fd = get_unused_fd_flags(O_CLOEXEC);
3194	if (fd < 0)
3195		return fd;
3196
3198	id = bpf_link_alloc_id(link);
3199	if (id < 0) {
3200		put_unused_fd(fd);
3201		return id;
3202	}
3203
3204	file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
3205	if (IS_ERR(file)) {
3206		bpf_link_free_id(id);
3207		put_unused_fd(fd);
3208		return PTR_ERR(file);
3209	}
3210
3211	primer->link = link;
3212	primer->file = file;
3213	primer->fd = fd;
3214	primer->id = id;
3215	return 0;
3216}
3217
3218int bpf_link_settle(struct bpf_link_primer *primer)
3219{
3220	/* make bpf_link fetchable by ID */
3221	spin_lock_bh(&link_idr_lock);
3222	primer->link->id = primer->id;
3223	spin_unlock_bh(&link_idr_lock);
3224	/* make bpf_link fetchable by FD */
3225	fd_install(primer->fd, primer->file);
3226	/* pass through installed FD */
3227	return primer->fd;
3228}
3229
3230int bpf_link_new_fd(struct bpf_link *link)
3231{
3232	return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
3233}
3234
3235struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3236{
3237	struct fd f = fdget(ufd);
3238	struct bpf_link *link;
3239
3240	if (!f.file)
3241		return ERR_PTR(-EBADF);
3242	if (f.file->f_op != &bpf_link_fops) {
3243		fdput(f);
3244		return ERR_PTR(-EINVAL);
3245	}
3246
3247	link = f.file->private_data;
3248	bpf_link_inc(link);
3249	fdput(f);
3250
3251	return link;
3252}
3253EXPORT_SYMBOL(bpf_link_get_from_fd);
3254
3255static void bpf_tracing_link_release(struct bpf_link *link)
3256{
3257	struct bpf_tracing_link *tr_link =
3258		container_of(link, struct bpf_tracing_link, link.link);
3259
3260	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
3261						tr_link->trampoline));
3262
3263	bpf_trampoline_put(tr_link->trampoline);
3264
3265	/* tgt_prog is NULL if target is a kernel function */
3266	if (tr_link->tgt_prog)
3267		bpf_prog_put(tr_link->tgt_prog);
3268}
3269
3270static void bpf_tracing_link_dealloc(struct bpf_link *link)
3271{
3272	struct bpf_tracing_link *tr_link =
3273		container_of(link, struct bpf_tracing_link, link.link);
3274
3275	kfree(tr_link);
3276}
3277
3278static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
3279					 struct seq_file *seq)
3280{
3281	struct bpf_tracing_link *tr_link =
3282		container_of(link, struct bpf_tracing_link, link.link);
3283	u32 target_btf_id, target_obj_id;
3284
3285	bpf_trampoline_unpack_key(tr_link->trampoline->key,
3286				  &target_obj_id, &target_btf_id);
3287	seq_printf(seq,
3288		   "attach_type:\t%d\n"
3289		   "target_obj_id:\t%u\n"
3290		   "target_btf_id:\t%u\n",
3291		   tr_link->attach_type,
3292		   target_obj_id,
3293		   target_btf_id);
3294}
3295
3296static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
3297					   struct bpf_link_info *info)
3298{
3299	struct bpf_tracing_link *tr_link =
3300		container_of(link, struct bpf_tracing_link, link.link);
3301
3302	info->tracing.attach_type = tr_link->attach_type;
3303	bpf_trampoline_unpack_key(tr_link->trampoline->key,
3304				  &info->tracing.target_obj_id,
3305				  &info->tracing.target_btf_id);
3306
3307	return 0;
3308}
3309
3310static const struct bpf_link_ops bpf_tracing_link_lops = {
3311	.release = bpf_tracing_link_release,
3312	.dealloc = bpf_tracing_link_dealloc,
3313	.show_fdinfo = bpf_tracing_link_show_fdinfo,
3314	.fill_link_info = bpf_tracing_link_fill_link_info,
3315};
3316
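/* Attach a TRACING/EXT/LSM program to its target. The target is either the
 * (tgt_prog_fd, btf_id) pair supplied by the caller or, for the legacy
 * raw_tracepoint_open path, the destination saved in prog->aux at load time;
 * see the comment block on dst_trampoline below for the exact cases.
 */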
3317static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3318				   int tgt_prog_fd,
3319				   u32 btf_id,
3320				   u64 bpf_cookie)
3321{
3322	struct bpf_link_primer link_primer;
3323	struct bpf_prog *tgt_prog = NULL;
3324	struct bpf_trampoline *tr = NULL;
3325	struct bpf_tracing_link *link;
3326	u64 key = 0;
3327	int err;
3328
3329	switch (prog->type) {
3330	case BPF_PROG_TYPE_TRACING:
3331		if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3332		    prog->expected_attach_type != BPF_TRACE_FEXIT &&
3333		    prog->expected_attach_type != BPF_MODIFY_RETURN) {
3334			err = -EINVAL;
3335			goto out_put_prog;
3336		}
3337		break;
3338	case BPF_PROG_TYPE_EXT:
3339		if (prog->expected_attach_type != 0) {
3340			err = -EINVAL;
3341			goto out_put_prog;
3342		}
3343		break;
3344	case BPF_PROG_TYPE_LSM:
3345		if (prog->expected_attach_type != BPF_LSM_MAC) {
3346			err = -EINVAL;
3347			goto out_put_prog;
3348		}
3349		break;
3350	default:
3351		err = -EINVAL;
3352		goto out_put_prog;
3353	}
3354
3355	if (!!tgt_prog_fd != !!btf_id) {
3356		err = -EINVAL;
3357		goto out_put_prog;
3358	}
3359
3360	if (tgt_prog_fd) {
3361		/*
3362		 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
3363		 * part is ever changed to implement the same for
3364		 * BPF_PROG_TYPE_TRACING, do not forget to update the way the
3365		 * attach_tracing_prog flag is set.
3366		 */
3367		if (prog->type != BPF_PROG_TYPE_EXT) {
3368			err = -EINVAL;
3369			goto out_put_prog;
3370		}
3371
3372		tgt_prog = bpf_prog_get(tgt_prog_fd);
3373		if (IS_ERR(tgt_prog)) {
3374			err = PTR_ERR(tgt_prog);
3375			tgt_prog = NULL;
3376			goto out_put_prog;
3377		}
3378
3379		key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3380	}
3381
3382	link = kzalloc(sizeof(*link), GFP_USER);
3383	if (!link) {
3384		err = -ENOMEM;
3385		goto out_put_prog;
3386	}
3387	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
3388		      &bpf_tracing_link_lops, prog);
3389	link->attach_type = prog->expected_attach_type;
3390	link->link.cookie = bpf_cookie;
3391
3392	mutex_lock(&prog->aux->dst_mutex);
3393
3394	/* There are a few possible cases here:
3395	 *
3396	 * - if prog->aux->dst_trampoline is set, the program was just loaded
3397	 *   and not yet attached to anything, so we can use the values stored
3398	 *   in prog->aux
3399	 *
3400	 * - if prog->aux->dst_trampoline is NULL, the program has already been
3401	 *   attached to a target and its initial target was cleared (below)
3402	 *
3403	 * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3404	 *   target_btf_id using the link_create API.
3405	 *
3406	 * - if tgt_prog == NULL, this function was called using the old
3407	 *   raw_tracepoint_open API and we need a target from prog->aux
3408	 *
3409	 * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
3410	 *   was detached and is going for re-attachment.
3411	 *
3412	 * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
3413	 *   are NULL, then program was already attached and user did not provide
3414	 *   tgt_prog_fd so we have no way to find out or create trampoline
3415	 */
3416	if (!prog->aux->dst_trampoline && !tgt_prog) {
3417		/*
3418		 * Allow re-attach for TRACING and LSM programs. If it's
3419		 * currently linked, bpf_trampoline_link_prog will fail.
3420		 * EXT programs need to specify tgt_prog_fd, so they
3421		 * re-attach in a separate code path.
3422		 */
3423		if (prog->type != BPF_PROG_TYPE_TRACING &&
3424		    prog->type != BPF_PROG_TYPE_LSM) {
3425			err = -EINVAL;
3426			goto out_unlock;
3427		}
3428		/* We can allow re-attach only if we have valid attach_btf. */
3429		if (!prog->aux->attach_btf) {
3430			err = -EINVAL;
3431			goto out_unlock;
3432		}
3433		btf_id = prog->aux->attach_btf_id;
3434		key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
3435	}
3436
3437	if (!prog->aux->dst_trampoline ||
3438	    (key && key != prog->aux->dst_trampoline->key)) {
3439		/* If there is no saved target, or the specified target is
3440		 * different from the destination specified at load time, we
3441		 * need a new trampoline and a check for compatibility
3442		 */
3443		struct bpf_attach_target_info tgt_info = {};
3444
3445		err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3446					      &tgt_info);
3447		if (err)
3448			goto out_unlock;
3449
3450		if (tgt_info.tgt_mod) {
3451			module_put(prog->aux->mod);
3452			prog->aux->mod = tgt_info.tgt_mod;
3453		}
3454
3455		tr = bpf_trampoline_get(key, &tgt_info);
3456		if (!tr) {
3457			err = -ENOMEM;
3458			goto out_unlock;
3459		}
3460	} else {
3461		/* The caller didn't specify a target, or the target was the
3462		 * same as the destination supplied during program load. This
3463		 * means we can reuse the trampoline and reference from program
3464		 * load time, and there is no need to allocate a new one. This
3465		 * can only happen once for any program, as the saved values in
3466		 * prog->aux are cleared below.
3467		 */
3468		tr = prog->aux->dst_trampoline;
3469		tgt_prog = prog->aux->dst_prog;
3470	}
3471
3472	err = bpf_link_prime(&link->link.link, &link_primer);
3473	if (err)
3474		goto out_unlock;
3475
3476	err = bpf_trampoline_link_prog(&link->link, tr);
3477	if (err) {
3478		bpf_link_cleanup(&link_primer);
3479		link = NULL;
3480		goto out_unlock;
3481	}
3482
3483	link->tgt_prog = tgt_prog;
3484	link->trampoline = tr;
3485
3486	/* Always clear the trampoline and target prog from prog->aux to make
3487	 * sure the original attach destination is not kept alive after a
3488	 * program is (re-)attached to another target.
3489	 */
3490	if (prog->aux->dst_prog &&
3491	    (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3492		/* got extra prog ref from syscall, or attaching to different prog */
3493		bpf_prog_put(prog->aux->dst_prog);
3494	if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3495		/* we allocated a new trampoline, so free the old one */
3496		bpf_trampoline_put(prog->aux->dst_trampoline);
3497
3498	prog->aux->dst_prog = NULL;
3499	prog->aux->dst_trampoline = NULL;
3500	mutex_unlock(&prog->aux->dst_mutex);
3501
3502	return bpf_link_settle(&link_primer);
3503out_unlock:
3504	if (tr && tr != prog->aux->dst_trampoline)
3505		bpf_trampoline_put(tr);
3506	mutex_unlock(&prog->aux->dst_mutex);
3507	kfree(link);
3508out_put_prog:
3509	if (tgt_prog_fd && tgt_prog)
3510		bpf_prog_put(tgt_prog);
3511	return err;
3512}
3513
3514static void bpf_raw_tp_link_release(struct bpf_link *link)
3515{
3516	struct bpf_raw_tp_link *raw_tp =
3517		container_of(link, struct bpf_raw_tp_link, link);
3518
3519	bpf_probe_unregister(raw_tp->btp, raw_tp);
3520	bpf_put_raw_tracepoint(raw_tp->btp);
3521}
3522
3523static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3524{
3525	struct bpf_raw_tp_link *raw_tp =
3526		container_of(link, struct bpf_raw_tp_link, link);
3527
3528	kfree(raw_tp);
3529}
3530
3531static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3532					struct seq_file *seq)
3533{
3534	struct bpf_raw_tp_link *raw_tp_link =
3535		container_of(link, struct bpf_raw_tp_link, link);
3536
3537	seq_printf(seq,
3538		   "tp_name:\t%s\n",
3539		   raw_tp_link->btp->tp->name);
3540}
3541
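/* Copy the NUL-terminated string @buf of length @len into the @ulen-byte user
 * buffer @ubuf. If the buffer is too small, a truncated but still
 * NUL-terminated prefix is copied and -ENOSPC is returned to signal the
 * truncation.
 */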
3542static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
3543			    u32 len)
3544{
3545	if (ulen >= len + 1) {
3546		if (copy_to_user(ubuf, buf, len + 1))
3547			return -EFAULT;
3548	} else {
3549		char zero = '\0';
3550
3551		if (copy_to_user(ubuf, buf, ulen - 1))
3552			return -EFAULT;
3553		if (put_user(zero, ubuf + ulen - 1))
3554			return -EFAULT;
3555		return -ENOSPC;
3556	}
3557
3558	return 0;
3559}
3560
3561static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3562					  struct bpf_link_info *info)
3563{
3564	struct bpf_raw_tp_link *raw_tp_link =
3565		container_of(link, struct bpf_raw_tp_link, link);
3566	char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3567	const char *tp_name = raw_tp_link->btp->tp->name;
3568	u32 ulen = info->raw_tracepoint.tp_name_len;
3569	size_t tp_len = strlen(tp_name);
3570
3571	if (!ulen ^ !ubuf)
3572		return -EINVAL;
3573
3574	info->raw_tracepoint.tp_name_len = tp_len + 1;
3575
3576	if (!ubuf)
3577		return 0;
3578
3579	return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
3580}
3581
3582static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3583	.release = bpf_raw_tp_link_release,
3584	.dealloc_deferred = bpf_raw_tp_link_dealloc,
3585	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3586	.fill_link_info = bpf_raw_tp_link_fill_link_info,
3587};
3588
3589#ifdef CONFIG_PERF_EVENTS
3590struct bpf_perf_link {
3591	struct bpf_link link;
3592	struct file *perf_file;
3593};
3594
3595static void bpf_perf_link_release(struct bpf_link *link)
3596{
3597	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3598	struct perf_event *event = perf_link->perf_file->private_data;
3599
3600	perf_event_free_bpf_prog(event);
3601	fput(perf_link->perf_file);
3602}
3603
3604static void bpf_perf_link_dealloc(struct bpf_link *link)
3605{
3606	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3607
3608	kfree(perf_link);
3609}
3610
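/* Common helper for perf-event based link info: looks up the attached
 * program's name, offset, address, fd type and miss count via
 * bpf_get_perf_event_info() and copies the name into the user buffer.
 * @uname and @ulen must either both be set or both be zero/NULL.
 */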
3611static int bpf_perf_link_fill_common(const struct perf_event *event,
3612				     char __user *uname, u32 ulen,
3613				     u64 *probe_offset, u64 *probe_addr,
3614				     u32 *fd_type, unsigned long *missed)
3615{
3616	const char *buf;
3617	u32 prog_id;
3618	size_t len;
3619	int err;
3620
3621	if (!ulen ^ !uname)
3622		return -EINVAL;
3623
3624	err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
3625				      probe_offset, probe_addr, missed);
3626	if (err)
3627		return err;
3628	if (!uname)
3629		return 0;
3630	if (buf) {
3631		len = strlen(buf);
3632		err = bpf_copy_to_user(uname, buf, ulen, len);
3633		if (err)
3634			return err;
3635	} else {
3636		char zero = '\0';
3637
3638		if (put_user(zero, uname))
3639			return -EFAULT;
3640	}
3641	return 0;
3642}
3643
3644#ifdef CONFIG_KPROBE_EVENTS
3645static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
3646				     struct bpf_link_info *info)
3647{
3648	unsigned long missed;
3649	char __user *uname;
3650	u64 addr, offset;
3651	u32 ulen, type;
3652	int err;
3653
3654	uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
3655	ulen = info->perf_event.kprobe.name_len;
3656	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3657					&type, &missed);
3658	if (err)
3659		return err;
3660	if (type == BPF_FD_TYPE_KRETPROBE)
3661		info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
3662	else
3663		info->perf_event.type = BPF_PERF_EVENT_KPROBE;
3664
3665	info->perf_event.kprobe.offset = offset;
3666	info->perf_event.kprobe.missed = missed;
3667	if (!kallsyms_show_value(current_cred()))
3668		addr = 0;
3669	info->perf_event.kprobe.addr = addr;
3670	info->perf_event.kprobe.cookie = event->bpf_cookie;
3671	return 0;
3672}
3673#endif
3674
3675#ifdef CONFIG_UPROBE_EVENTS
3676static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
3677				     struct bpf_link_info *info)
3678{
3679	char __user *uname;
3680	u64 addr, offset;
3681	u32 ulen, type;
3682	int err;
3683
3684	uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
3685	ulen = info->perf_event.uprobe.name_len;
3686	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3687					&type, NULL);
3688	if (err)
3689		return err;
3690
3691	if (type == BPF_FD_TYPE_URETPROBE)
3692		info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
3693	else
3694		info->perf_event.type = BPF_PERF_EVENT_UPROBE;
3695	info->perf_event.uprobe.offset = offset;
3696	info->perf_event.uprobe.cookie = event->bpf_cookie;
3697	return 0;
3698}
3699#endif
3700
3701static int bpf_perf_link_fill_probe(const struct perf_event *event,
3702				    struct bpf_link_info *info)
3703{
3704#ifdef CONFIG_KPROBE_EVENTS
3705	if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3706		return bpf_perf_link_fill_kprobe(event, info);
3707#endif
3708#ifdef CONFIG_UPROBE_EVENTS
3709	if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3710		return bpf_perf_link_fill_uprobe(event, info);
3711#endif
3712	return -EOPNOTSUPP;
3713}
3714
3715static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3716					 struct bpf_link_info *info)
3717{
3718	char __user *uname;
3719	u32 ulen;
3720
3721	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3722	ulen = info->perf_event.tracepoint.name_len;
3723	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3724	info->perf_event.tracepoint.cookie = event->bpf_cookie;
3725	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
3726}
3727
3728static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3729					 struct bpf_link_info *info)
3730{
3731	info->perf_event.event.type = event->attr.type;
3732	info->perf_event.event.config = event->attr.config;
3733	info->perf_event.event.cookie = event->bpf_cookie;
3734	info->perf_event.type = BPF_PERF_EVENT_EVENT;
3735	return 0;
3736}
3737
3738static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3739					struct bpf_link_info *info)
3740{
3741	struct bpf_perf_link *perf_link;
3742	const struct perf_event *event;
3743
3744	perf_link = container_of(link, struct bpf_perf_link, link);
3745	event = perf_get_event(perf_link->perf_file);
3746	if (IS_ERR(event))
3747		return PTR_ERR(event);
3748
3749	switch (event->prog->type) {
3750	case BPF_PROG_TYPE_PERF_EVENT:
3751		return bpf_perf_link_fill_perf_event(event, info);
3752	case BPF_PROG_TYPE_TRACEPOINT:
3753		return bpf_perf_link_fill_tracepoint(event, info);
3754	case BPF_PROG_TYPE_KPROBE:
3755		return bpf_perf_link_fill_probe(event, info);
3756	default:
3757		return -EOPNOTSUPP;
3758	}
3759}
3760
3761static const struct bpf_link_ops bpf_perf_link_lops = {
3762	.release = bpf_perf_link_release,
3763	.dealloc = bpf_perf_link_dealloc,
3764	.fill_link_info = bpf_perf_link_fill_link_info,
3765};
3766
3767static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3768{
3769	struct bpf_link_primer link_primer;
3770	struct bpf_perf_link *link;
3771	struct perf_event *event;
3772	struct file *perf_file;
3773	int err;
3774
3775	if (attr->link_create.flags)
3776		return -EINVAL;
3777
3778	perf_file = perf_event_get(attr->link_create.target_fd);
3779	if (IS_ERR(perf_file))
3780		return PTR_ERR(perf_file);
3781
3782	link = kzalloc(sizeof(*link), GFP_USER);
3783	if (!link) {
3784		err = -ENOMEM;
3785		goto out_put_file;
3786	}
3787	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
3788	link->perf_file = perf_file;
3789
3790	err = bpf_link_prime(&link->link, &link_primer);
3791	if (err) {
3792		kfree(link);
3793		goto out_put_file;
3794	}
3795
3796	event = perf_file->private_data;
3797	err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
3798	if (err) {
3799		bpf_link_cleanup(&link_primer);
3800		goto out_put_file;
3801	}
3802	/* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3803	bpf_prog_inc(prog);
3804
3805	return bpf_link_settle(&link_primer);
3806
3807out_put_file:
3808	fput(perf_file);
3809	return err;
3810}
3811#else
3812static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3813{
3814	return -EOPNOTSUPP;
3815}
3816#endif /* CONFIG_PERF_EVENTS */
3817
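/* Attach @prog to a raw tracepoint. TRACING/EXT/LSM programs carry their
 * attach point in prog->aux (resolved from btf_id at load time) and must not
 * pass a name; of those, only TRACING programs with BPF_TRACE_RAW_TP attach
 * here directly, the rest are redirected to bpf_tracing_prog_attach().
 * RAW_TRACEPOINT programs take the tracepoint name from user space.
 */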
3818static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3819				  const char __user *user_tp_name, u64 cookie)
3820{
3821	struct bpf_link_primer link_primer;
3822	struct bpf_raw_tp_link *link;
3823	struct bpf_raw_event_map *btp;
3824	const char *tp_name;
3825	char buf[128];
3826	int err;
3827
3828	switch (prog->type) {
3829	case BPF_PROG_TYPE_TRACING:
3830	case BPF_PROG_TYPE_EXT:
3831	case BPF_PROG_TYPE_LSM:
3832		if (user_tp_name)
3833			/* The attach point for this category of programs
3834			 * should be specified via btf_id during program load.
3835			 */
3836			return -EINVAL;
3837		if (prog->type == BPF_PROG_TYPE_TRACING &&
3838		    prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3839			tp_name = prog->aux->attach_func_name;
3840			break;
3841		}
3842		return bpf_tracing_prog_attach(prog, 0, 0, 0);
3843	case BPF_PROG_TYPE_RAW_TRACEPOINT:
3844	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3845		if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
3846			return -EFAULT;
3847		buf[sizeof(buf) - 1] = 0;
3848		tp_name = buf;
3849		break;
3850	default:
3851		return -EINVAL;
3852	}
3853
3854	btp = bpf_get_raw_tracepoint(tp_name);
3855	if (!btp)
3856		return -ENOENT;
3857
3858	link = kzalloc(sizeof(*link), GFP_USER);
3859	if (!link) {
3860		err = -ENOMEM;
3861		goto out_put_btp;
3862	}
3863	bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
3864		      &bpf_raw_tp_link_lops, prog);
3865	link->btp = btp;
3866	link->cookie = cookie;
3867
3868	err = bpf_link_prime(&link->link, &link_primer);
3869	if (err) {
3870		kfree(link);
3871		goto out_put_btp;
3872	}
3873
3874	err = bpf_probe_register(link->btp, link);
3875	if (err) {
3876		bpf_link_cleanup(&link_primer);
3877		goto out_put_btp;
3878	}
3879
3880	return bpf_link_settle(&link_primer);
3881
3882out_put_btp:
3883	bpf_put_raw_tracepoint(btp);
3884	return err;
3885}
3886
3887#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
3888
3889static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3890{
3891	struct bpf_prog *prog;
3892	void __user *tp_name;
3893	__u64 cookie;
3894	int fd;
3895
3896	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3897		return -EINVAL;
3898
3899	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
3900	if (IS_ERR(prog))
3901		return PTR_ERR(prog);
3902
3903	tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
3904	cookie = attr->raw_tracepoint.cookie;
3905	fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
3906	if (fd < 0)
3907		bpf_prog_put(prog);
3908	return fd;
3909}
3910
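/* Map an attach type from an attach/link-create request to the program type
 * that is expected to attach there; BPF_PROG_TYPE_UNSPEC is returned when
 * there is no fixed mapping.
 */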
3911static enum bpf_prog_type
3912attach_type_to_prog_type(enum bpf_attach_type attach_type)
3913{
3914	switch (attach_type) {
3915	case BPF_CGROUP_INET_INGRESS:
3916	case BPF_CGROUP_INET_EGRESS:
3917		return BPF_PROG_TYPE_CGROUP_SKB;
3918	case BPF_CGROUP_INET_SOCK_CREATE:
3919	case BPF_CGROUP_INET_SOCK_RELEASE:
3920	case BPF_CGROUP_INET4_POST_BIND:
3921	case BPF_CGROUP_INET6_POST_BIND:
3922		return BPF_PROG_TYPE_CGROUP_SOCK;
3923	case BPF_CGROUP_INET4_BIND:
3924	case BPF_CGROUP_INET6_BIND:
3925	case BPF_CGROUP_INET4_CONNECT:
3926	case BPF_CGROUP_INET6_CONNECT:
3927	case BPF_CGROUP_UNIX_CONNECT:
3928	case BPF_CGROUP_INET4_GETPEERNAME:
3929	case BPF_CGROUP_INET6_GETPEERNAME:
3930	case BPF_CGROUP_UNIX_GETPEERNAME:
3931	case BPF_CGROUP_INET4_GETSOCKNAME:
3932	case BPF_CGROUP_INET6_GETSOCKNAME:
3933	case BPF_CGROUP_UNIX_GETSOCKNAME:
3934	case BPF_CGROUP_UDP4_SENDMSG:
3935	case BPF_CGROUP_UDP6_SENDMSG:
3936	case BPF_CGROUP_UNIX_SENDMSG:
3937	case BPF_CGROUP_UDP4_RECVMSG:
3938	case BPF_CGROUP_UDP6_RECVMSG:
3939	case BPF_CGROUP_UNIX_RECVMSG:
3940		return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3941	case BPF_CGROUP_SOCK_OPS:
3942		return BPF_PROG_TYPE_SOCK_OPS;
3943	case BPF_CGROUP_DEVICE:
3944		return BPF_PROG_TYPE_CGROUP_DEVICE;
3945	case BPF_SK_MSG_VERDICT:
3946		return BPF_PROG_TYPE_SK_MSG;
3947	case BPF_SK_SKB_STREAM_PARSER:
3948	case BPF_SK_SKB_STREAM_VERDICT:
3949	case BPF_SK_SKB_VERDICT:
3950		return BPF_PROG_TYPE_SK_SKB;
3951	case BPF_LIRC_MODE2:
3952		return BPF_PROG_TYPE_LIRC_MODE2;
3953	case BPF_FLOW_DISSECTOR:
3954		return BPF_PROG_TYPE_FLOW_DISSECTOR;
3955	case BPF_CGROUP_SYSCTL:
3956		return BPF_PROG_TYPE_CGROUP_SYSCTL;
3957	case BPF_CGROUP_GETSOCKOPT:
3958	case BPF_CGROUP_SETSOCKOPT:
3959		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3960	case BPF_TRACE_ITER:
3961	case BPF_TRACE_RAW_TP:
3962	case BPF_TRACE_FENTRY:
3963	case BPF_TRACE_FEXIT:
3964	case BPF_MODIFY_RETURN:
3965		return BPF_PROG_TYPE_TRACING;
3966	case BPF_LSM_MAC:
3967		return BPF_PROG_TYPE_LSM;
3968	case BPF_SK_LOOKUP:
3969		return BPF_PROG_TYPE_SK_LOOKUP;
3970	case BPF_XDP:
3971		return BPF_PROG_TYPE_XDP;
3972	case BPF_LSM_CGROUP:
3973		return BPF_PROG_TYPE_LSM;
3974	case BPF_TCX_INGRESS:
3975	case BPF_TCX_EGRESS:
3976	case BPF_NETKIT_PRIMARY:
3977	case BPF_NETKIT_PEER:
3978		return BPF_PROG_TYPE_SCHED_CLS;
3979	default:
3980		return BPF_PROG_TYPE_UNSPEC;
3981	}
3982}
3983
3984static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3985					     enum bpf_attach_type attach_type)
3986{
3987	enum bpf_prog_type ptype;
3988
3989	switch (prog->type) {
3990	case BPF_PROG_TYPE_CGROUP_SOCK:
3991	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3992	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3993	case BPF_PROG_TYPE_SK_LOOKUP:
3994		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3995	case BPF_PROG_TYPE_CGROUP_SKB:
3996		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
3997			/* cg-skb progs can be loaded by an unprivileged user,
3998			 * so check permissions at attach time.
3999			 */
4000			return -EPERM;
4001
4002		ptype = attach_type_to_prog_type(attach_type);
4003		if (prog->type != ptype)
4004			return -EINVAL;
4005
4006		return prog->enforce_expected_attach_type &&
4007			prog->expected_attach_type != attach_type ?
4008			-EINVAL : 0;
4009	case BPF_PROG_TYPE_EXT:
4010		return 0;
4011	case BPF_PROG_TYPE_NETFILTER:
4012		if (attach_type != BPF_NETFILTER)
4013			return -EINVAL;
4014		return 0;
4015	case BPF_PROG_TYPE_PERF_EVENT:
4016	case BPF_PROG_TYPE_TRACEPOINT:
4017		if (attach_type != BPF_PERF_EVENT)
4018			return -EINVAL;
4019		return 0;
4020	case BPF_PROG_TYPE_KPROBE:
4021		if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
4022		    attach_type != BPF_TRACE_KPROBE_MULTI)
4023			return -EINVAL;
4024		if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
4025		    attach_type != BPF_TRACE_KPROBE_SESSION)
4026			return -EINVAL;
4027		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
4028		    attach_type != BPF_TRACE_UPROBE_MULTI)
4029			return -EINVAL;
4030		if (attach_type != BPF_PERF_EVENT &&
4031		    attach_type != BPF_TRACE_KPROBE_MULTI &&
4032		    attach_type != BPF_TRACE_KPROBE_SESSION &&
4033		    attach_type != BPF_TRACE_UPROBE_MULTI)
4034			return -EINVAL;
4035		return 0;
4036	case BPF_PROG_TYPE_SCHED_CLS:
4037		if (attach_type != BPF_TCX_INGRESS &&
4038		    attach_type != BPF_TCX_EGRESS &&
4039		    attach_type != BPF_NETKIT_PRIMARY &&
4040		    attach_type != BPF_NETKIT_PEER)
4041			return -EINVAL;
4042		return 0;
4043	default:
4044		ptype = attach_type_to_prog_type(attach_type);
4045		if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
4046			return -EINVAL;
4047		return 0;
4048	}
4049}
4050
4051#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
4052
4053#define BPF_F_ATTACH_MASK_BASE	\
4054	(BPF_F_ALLOW_OVERRIDE |	\
4055	 BPF_F_ALLOW_MULTI |	\
4056	 BPF_F_REPLACE)
4057
4058#define BPF_F_ATTACH_MASK_MPROG	\
4059	(BPF_F_REPLACE |	\
4060	 BPF_F_BEFORE |		\
4061	 BPF_F_AFTER |		\
4062	 BPF_F_ID |		\
4063	 BPF_F_LINK)
4064
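/* Handler for the legacy BPF_PROG_ATTACH command.  A minimal userspace
 * sketch (illustrative only, not kernel code) attaching a cgroup_skb
 * program to a cgroup; field names follow the union bpf_attr UAPI and
 * error handling is omitted:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd     = cgroup_fd;		// open() on the cgroup dir
 *	attr.attach_bpf_fd = prog_fd;		// fd from BPF_PROG_LOAD
 *	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
 *	attr.attach_flags  = BPF_F_ALLOW_MULTI;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */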
4065static int bpf_prog_attach(const union bpf_attr *attr)
4066{
4067	enum bpf_prog_type ptype;
4068	struct bpf_prog *prog;
4069	int ret;
4070
4071	if (CHECK_ATTR(BPF_PROG_ATTACH))
4072		return -EINVAL;
4073
4074	ptype = attach_type_to_prog_type(attr->attach_type);
4075	if (ptype == BPF_PROG_TYPE_UNSPEC)
4076		return -EINVAL;
4077	if (bpf_mprog_supported(ptype)) {
4078		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4079			return -EINVAL;
4080	} else {
4081		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
4082			return -EINVAL;
4083		if (attr->relative_fd ||
4084		    attr->expected_revision)
4085			return -EINVAL;
4086	}
4087
4088	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
4089	if (IS_ERR(prog))
4090		return PTR_ERR(prog);
4091
4092	if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
4093		bpf_prog_put(prog);
4094		return -EINVAL;
4095	}
4096
4097	switch (ptype) {
4098	case BPF_PROG_TYPE_SK_SKB:
4099	case BPF_PROG_TYPE_SK_MSG:
4100		ret = sock_map_get_from_fd(attr, prog);
4101		break;
4102	case BPF_PROG_TYPE_LIRC_MODE2:
4103		ret = lirc_prog_attach(attr, prog);
4104		break;
4105	case BPF_PROG_TYPE_FLOW_DISSECTOR:
4106		ret = netns_bpf_prog_attach(attr, prog);
4107		break;
4108	case BPF_PROG_TYPE_CGROUP_DEVICE:
4109	case BPF_PROG_TYPE_CGROUP_SKB:
4110	case BPF_PROG_TYPE_CGROUP_SOCK:
4111	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4112	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4113	case BPF_PROG_TYPE_CGROUP_SYSCTL:
4114	case BPF_PROG_TYPE_SOCK_OPS:
4115	case BPF_PROG_TYPE_LSM:
4116		if (ptype == BPF_PROG_TYPE_LSM &&
4117		    prog->expected_attach_type != BPF_LSM_CGROUP)
4118			ret = -EINVAL;
4119		else
4120			ret = cgroup_bpf_prog_attach(attr, ptype, prog);
4121		break;
4122	case BPF_PROG_TYPE_SCHED_CLS:
4123		if (attr->attach_type == BPF_TCX_INGRESS ||
4124		    attr->attach_type == BPF_TCX_EGRESS)
4125			ret = tcx_prog_attach(attr, prog);
4126		else
4127			ret = netkit_prog_attach(attr, prog);
4128		break;
4129	default:
4130		ret = -EINVAL;
4131	}
4132
4133	if (ret)
4134		bpf_prog_put(prog);
4135	return ret;
4136}
4137
4138#define BPF_PROG_DETACH_LAST_FIELD expected_revision
4139
4140static int bpf_prog_detach(const union bpf_attr *attr)
4141{
4142	struct bpf_prog *prog = NULL;
4143	enum bpf_prog_type ptype;
4144	int ret;
4145
4146	if (CHECK_ATTR(BPF_PROG_DETACH))
4147		return -EINVAL;
4148
4149	ptype = attach_type_to_prog_type(attr->attach_type);
4150	if (bpf_mprog_supported(ptype)) {
4151		if (ptype == BPF_PROG_TYPE_UNSPEC)
4152			return -EINVAL;
4153		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4154			return -EINVAL;
4155		if (attr->attach_bpf_fd) {
4156			prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
4157			if (IS_ERR(prog))
4158				return PTR_ERR(prog);
4159		}
4160	} else if (attr->attach_flags ||
4161		   attr->relative_fd ||
4162		   attr->expected_revision) {
4163		return -EINVAL;
4164	}
4165
4166	switch (ptype) {
4167	case BPF_PROG_TYPE_SK_MSG:
4168	case BPF_PROG_TYPE_SK_SKB:
4169		ret = sock_map_prog_detach(attr, ptype);
4170		break;
4171	case BPF_PROG_TYPE_LIRC_MODE2:
4172		ret = lirc_prog_detach(attr);
4173		break;
4174	case BPF_PROG_TYPE_FLOW_DISSECTOR:
4175		ret = netns_bpf_prog_detach(attr, ptype);
4176		break;
4177	case BPF_PROG_TYPE_CGROUP_DEVICE:
4178	case BPF_PROG_TYPE_CGROUP_SKB:
4179	case BPF_PROG_TYPE_CGROUP_SOCK:
4180	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4181	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4182	case BPF_PROG_TYPE_CGROUP_SYSCTL:
4183	case BPF_PROG_TYPE_SOCK_OPS:
4184	case BPF_PROG_TYPE_LSM:
4185		ret = cgroup_bpf_prog_detach(attr, ptype);
4186		break;
4187	case BPF_PROG_TYPE_SCHED_CLS:
4188		if (attr->attach_type == BPF_TCX_INGRESS ||
4189		    attr->attach_type == BPF_TCX_EGRESS)
4190			ret = tcx_prog_detach(attr, prog);
4191		else
4192			ret = netkit_prog_detach(attr, prog);
4193		break;
4194	default:
4195		ret = -EINVAL;
4196	}
4197
4198	if (prog)
4199		bpf_prog_put(prog);
4200	return ret;
4201}
4202
4203#define BPF_PROG_QUERY_LAST_FIELD query.revision
4204
4205static int bpf_prog_query(const union bpf_attr *attr,
4206			  union bpf_attr __user *uattr)
4207{
4208	if (!bpf_net_capable())
4209		return -EPERM;
4210	if (CHECK_ATTR(BPF_PROG_QUERY))
4211		return -EINVAL;
4212	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
4213		return -EINVAL;
4214
4215	switch (attr->query.attach_type) {
4216	case BPF_CGROUP_INET_INGRESS:
4217	case BPF_CGROUP_INET_EGRESS:
4218	case BPF_CGROUP_INET_SOCK_CREATE:
4219	case BPF_CGROUP_INET_SOCK_RELEASE:
4220	case BPF_CGROUP_INET4_BIND:
4221	case BPF_CGROUP_INET6_BIND:
4222	case BPF_CGROUP_INET4_POST_BIND:
4223	case BPF_CGROUP_INET6_POST_BIND:
4224	case BPF_CGROUP_INET4_CONNECT:
4225	case BPF_CGROUP_INET6_CONNECT:
4226	case BPF_CGROUP_UNIX_CONNECT:
4227	case BPF_CGROUP_INET4_GETPEERNAME:
4228	case BPF_CGROUP_INET6_GETPEERNAME:
4229	case BPF_CGROUP_UNIX_GETPEERNAME:
4230	case BPF_CGROUP_INET4_GETSOCKNAME:
4231	case BPF_CGROUP_INET6_GETSOCKNAME:
4232	case BPF_CGROUP_UNIX_GETSOCKNAME:
4233	case BPF_CGROUP_UDP4_SENDMSG:
4234	case BPF_CGROUP_UDP6_SENDMSG:
4235	case BPF_CGROUP_UNIX_SENDMSG:
4236	case BPF_CGROUP_UDP4_RECVMSG:
4237	case BPF_CGROUP_UDP6_RECVMSG:
4238	case BPF_CGROUP_UNIX_RECVMSG:
4239	case BPF_CGROUP_SOCK_OPS:
4240	case BPF_CGROUP_DEVICE:
4241	case BPF_CGROUP_SYSCTL:
4242	case BPF_CGROUP_GETSOCKOPT:
4243	case BPF_CGROUP_SETSOCKOPT:
4244	case BPF_LSM_CGROUP:
4245		return cgroup_bpf_prog_query(attr, uattr);
4246	case BPF_LIRC_MODE2:
4247		return lirc_prog_query(attr, uattr);
4248	case BPF_FLOW_DISSECTOR:
4249	case BPF_SK_LOOKUP:
4250		return netns_bpf_prog_query(attr, uattr);
4251	case BPF_SK_SKB_STREAM_PARSER:
4252	case BPF_SK_SKB_STREAM_VERDICT:
4253	case BPF_SK_MSG_VERDICT:
4254	case BPF_SK_SKB_VERDICT:
4255		return sock_map_bpf_prog_query(attr, uattr);
4256	case BPF_TCX_INGRESS:
4257	case BPF_TCX_EGRESS:
4258		return tcx_prog_query(attr, uattr);
4259	case BPF_NETKIT_PRIMARY:
4260	case BPF_NETKIT_PEER:
4261		return netkit_prog_query(attr, uattr);
4262	default:
4263		return -EINVAL;
4264	}
4265}
4266
4267#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
4268
4269static int bpf_prog_test_run(const union bpf_attr *attr,
4270			     union bpf_attr __user *uattr)
4271{
4272	struct bpf_prog *prog;
4273	int ret = -ENOTSUPP;
4274
4275	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
4276		return -EINVAL;
4277
4278	if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
4279	    (!attr->test.ctx_size_in && attr->test.ctx_in))
4280		return -EINVAL;
4281
4282	if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
4283	    (!attr->test.ctx_size_out && attr->test.ctx_out))
4284		return -EINVAL;
4285
4286	prog = bpf_prog_get(attr->test.prog_fd);
4287	if (IS_ERR(prog))
4288		return PTR_ERR(prog);
4289
4290	if (prog->aux->ops->test_run)
4291		ret = prog->aux->ops->test_run(prog, attr, uattr);
4292
4293	bpf_prog_put(prog);
4294	return ret;
4295}
4296
4297#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4298
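/* Shared implementation for the BPF_{PROG,MAP,BTF,LINK}_GET_NEXT_ID
 * commands (CAP_SYS_ADMIN only).  An illustrative userspace walk over all
 * program IDs looks roughly like:
 *
 *	union bpf_attr attr = {};
 *
 *	while (!syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
 *		attr.start_id = attr.next_id;
 *	// the loop ends with -ENOENT once the highest ID has been returned
 */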
4299static int bpf_obj_get_next_id(const union bpf_attr *attr,
4300			       union bpf_attr __user *uattr,
4301			       struct idr *idr,
4302			       spinlock_t *lock)
4303{
4304	u32 next_id = attr->start_id;
4305	int err = 0;
4306
4307	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
4308		return -EINVAL;
4309
4310	if (!capable(CAP_SYS_ADMIN))
4311		return -EPERM;
4312
4313	next_id++;
4314	spin_lock_bh(lock);
4315	if (!idr_get_next(idr, &next_id))
4316		err = -ENOENT;
4317	spin_unlock_bh(lock);
4318
4319	if (!err)
4320		err = put_user(next_id, &uattr->next_id);
4321
4322	return err;
4323}
4324
4325struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
4326{
4327	struct bpf_map *map;
4328
4329	spin_lock_bh(&map_idr_lock);
4330again:
4331	map = idr_get_next(&map_idr, id);
4332	if (map) {
4333		map = __bpf_map_inc_not_zero(map, false);
4334		if (IS_ERR(map)) {
4335			(*id)++;
4336			goto again;
4337		}
4338	}
4339	spin_unlock_bh(&map_idr_lock);
4340
4341	return map;
4342}
4343
4344struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
4345{
4346	struct bpf_prog *prog;
4347
4348	spin_lock_bh(&prog_idr_lock);
4349again:
4350	prog = idr_get_next(&prog_idr, id);
4351	if (prog) {
4352		prog = bpf_prog_inc_not_zero(prog);
4353		if (IS_ERR(prog)) {
4354			(*id)++;
4355			goto again;
4356		}
4357	}
4358	spin_unlock_bh(&prog_idr_lock);
4359
4360	return prog;
4361}
4362
4363#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4364
4365struct bpf_prog *bpf_prog_by_id(u32 id)
4366{
4367	struct bpf_prog *prog;
4368
4369	if (!id)
4370		return ERR_PTR(-ENOENT);
4371
4372	spin_lock_bh(&prog_idr_lock);
4373	prog = idr_find(&prog_idr, id);
4374	if (prog)
4375		prog = bpf_prog_inc_not_zero(prog);
4376	else
4377		prog = ERR_PTR(-ENOENT);
4378	spin_unlock_bh(&prog_idr_lock);
4379	return prog;
4380}
4381
4382static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4383{
4384	struct bpf_prog *prog;
4385	u32 id = attr->prog_id;
4386	int fd;
4387
4388	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4389		return -EINVAL;
4390
4391	if (!capable(CAP_SYS_ADMIN))
4392		return -EPERM;
4393
4394	prog = bpf_prog_by_id(id);
4395	if (IS_ERR(prog))
4396		return PTR_ERR(prog);
4397
4398	fd = bpf_prog_new_fd(prog);
4399	if (fd < 0)
4400		bpf_prog_put(prog);
4401
4402	return fd;
4403}
4404
4405#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4406
4407static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4408{
4409	struct bpf_map *map;
4410	u32 id = attr->map_id;
4411	int f_flags;
4412	int fd;
4413
4414	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
4415	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4416		return -EINVAL;
4417
4418	if (!capable(CAP_SYS_ADMIN))
4419		return -EPERM;
4420
4421	f_flags = bpf_get_file_flag(attr->open_flags);
4422	if (f_flags < 0)
4423		return f_flags;
4424
4425	spin_lock_bh(&map_idr_lock);
4426	map = idr_find(&map_idr, id);
4427	if (map)
4428		map = __bpf_map_inc_not_zero(map, true);
4429	else
4430		map = ERR_PTR(-ENOENT);
4431	spin_unlock_bh(&map_idr_lock);
4432
4433	if (IS_ERR(map))
4434		return PTR_ERR(map);
4435
4436	fd = bpf_map_new_fd(map, f_flags);
4437	if (fd < 0)
4438		bpf_map_put_with_uref(map);
4439
4440	return fd;
4441}
4442
4443static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
4444					      unsigned long addr, u32 *off,
4445					      u32 *type)
4446{
4447	const struct bpf_map *map;
4448	int i;
4449
4450	mutex_lock(&prog->aux->used_maps_mutex);
4451	for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
4452		map = prog->aux->used_maps[i];
4453		if (map == (void *)addr) {
4454			*type = BPF_PSEUDO_MAP_FD;
4455			goto out;
4456		}
4457		if (!map->ops->map_direct_value_meta)
4458			continue;
4459		if (!map->ops->map_direct_value_meta(map, addr, off)) {
4460			*type = BPF_PSEUDO_MAP_VALUE;
4461			goto out;
4462		}
4463	}
4464	map = NULL;
4465
4466out:
4467	mutex_unlock(&prog->aux->used_maps_mutex);
4468	return map;
4469}
4470
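/* Produce a copy of the xlated image that is safe to hand to userspace:
 * internal tail-call and probe-memory encodings are rewritten back to
 * their UAPI forms, call immediates are zeroed unless the caller passes
 * bpf_dump_raw_ok(), and map pointers embedded in BPF_LD_IMM64 are
 * replaced with the map ID (plus direct-value offset).
 */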
4471static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
4472					      const struct cred *f_cred)
4473{
4474	const struct bpf_map *map;
4475	struct bpf_insn *insns;
4476	u32 off, type;
4477	u64 imm;
4478	u8 code;
4479	int i;
4480
4481	insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
4482			GFP_USER);
4483	if (!insns)
4484		return insns;
4485
4486	for (i = 0; i < prog->len; i++) {
4487		code = insns[i].code;
4488
4489		if (code == (BPF_JMP | BPF_TAIL_CALL)) {
4490			insns[i].code = BPF_JMP | BPF_CALL;
4491			insns[i].imm = BPF_FUNC_tail_call;
4492			/* fall-through */
4493		}
4494		if (code == (BPF_JMP | BPF_CALL) ||
4495		    code == (BPF_JMP | BPF_CALL_ARGS)) {
4496			if (code == (BPF_JMP | BPF_CALL_ARGS))
4497				insns[i].code = BPF_JMP | BPF_CALL;
4498			if (!bpf_dump_raw_ok(f_cred))
4499				insns[i].imm = 0;
4500			continue;
4501		}
4502		if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
4503			insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
4504			continue;
4505		}
4506
4507		if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
4508		     BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
4509			insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
4510			continue;
4511		}
4512
4513		if (code != (BPF_LD | BPF_IMM | BPF_DW))
4514			continue;
4515
4516		imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
4517		map = bpf_map_from_imm(prog, imm, &off, &type);
4518		if (map) {
4519			insns[i].src_reg = type;
4520			insns[i].imm = map->id;
4521			insns[i + 1].imm = off;
4522			continue;
4523		}
4524	}
4525
4526	return insns;
4527}
4528
4529static int set_info_rec_size(struct bpf_prog_info *info)
4530{
4531	/*
4532	 * Ensure info.*_rec_size matches the record size the kernel expects.
4533	 * A zero *_rec_size is only allowed if the corresponding _cnt is zero
4534	 * as well; in that case the kernel writes the expected _rec_size back
4535	 * into the info.
4536	 */
4540
4541	if ((info->nr_func_info || info->func_info_rec_size) &&
4542	    info->func_info_rec_size != sizeof(struct bpf_func_info))
4543		return -EINVAL;
4544
4545	if ((info->nr_line_info || info->line_info_rec_size) &&
4546	    info->line_info_rec_size != sizeof(struct bpf_line_info))
4547		return -EINVAL;
4548
4549	if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
4550	    info->jited_line_info_rec_size != sizeof(__u64))
4551		return -EINVAL;
4552
4553	info->func_info_rec_size = sizeof(struct bpf_func_info);
4554	info->line_info_rec_size = sizeof(struct bpf_line_info);
4555	info->jited_line_info_rec_size = sizeof(__u64);
4556
4557	return 0;
4558}
4559
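/* Fill struct bpf_prog_info for BPF_OBJ_GET_INFO_BY_FD on a program fd.
 * Callers without bpf_capable() only get the basic metadata; the parts
 * that expose raw kernel addresses (JIT image, jited ksyms, jited line
 * info) are additionally gated on bpf_dump_raw_ok() against the opener's
 * credentials.
 */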
4560static int bpf_prog_get_info_by_fd(struct file *file,
4561				   struct bpf_prog *prog,
4562				   const union bpf_attr *attr,
4563				   union bpf_attr __user *uattr)
4564{
4565	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4566	struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4567	struct bpf_prog_info info;
4568	u32 info_len = attr->info.info_len;
4569	struct bpf_prog_kstats stats;
4570	char __user *uinsns;
4571	u32 ulen;
4572	int err;
4573
4574	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4575	if (err)
4576		return err;
4577	info_len = min_t(u32, sizeof(info), info_len);
4578
4579	memset(&info, 0, sizeof(info));
4580	if (copy_from_user(&info, uinfo, info_len))
4581		return -EFAULT;
4582
4583	info.type = prog->type;
4584	info.id = prog->aux->id;
4585	info.load_time = prog->aux->load_time;
4586	info.created_by_uid = from_kuid_munged(current_user_ns(),
4587					       prog->aux->user->uid);
4588	info.gpl_compatible = prog->gpl_compatible;
4589
4590	memcpy(info.tag, prog->tag, sizeof(prog->tag));
4591	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4592
4593	mutex_lock(&prog->aux->used_maps_mutex);
4594	ulen = info.nr_map_ids;
4595	info.nr_map_ids = prog->aux->used_map_cnt;
4596	ulen = min_t(u32, info.nr_map_ids, ulen);
4597	if (ulen) {
4598		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4599		u32 i;
4600
4601		for (i = 0; i < ulen; i++)
4602			if (put_user(prog->aux->used_maps[i]->id,
4603				     &user_map_ids[i])) {
4604				mutex_unlock(&prog->aux->used_maps_mutex);
4605				return -EFAULT;
4606			}
4607	}
4608	mutex_unlock(&prog->aux->used_maps_mutex);
4609
4610	err = set_info_rec_size(&info);
4611	if (err)
4612		return err;
4613
4614	bpf_prog_get_stats(prog, &stats);
4615	info.run_time_ns = stats.nsecs;
4616	info.run_cnt = stats.cnt;
4617	info.recursion_misses = stats.misses;
4618
4619	info.verified_insns = prog->aux->verified_insns;
4620
4621	if (!bpf_capable()) {
4622		info.jited_prog_len = 0;
4623		info.xlated_prog_len = 0;
4624		info.nr_jited_ksyms = 0;
4625		info.nr_jited_func_lens = 0;
4626		info.nr_func_info = 0;
4627		info.nr_line_info = 0;
4628		info.nr_jited_line_info = 0;
4629		goto done;
4630	}
4631
4632	ulen = info.xlated_prog_len;
4633	info.xlated_prog_len = bpf_prog_insn_size(prog);
4634	if (info.xlated_prog_len && ulen) {
4635		struct bpf_insn *insns_sanitized;
4636		bool fault;
4637
4638		if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
4639			info.xlated_prog_insns = 0;
4640			goto done;
4641		}
4642		insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
4643		if (!insns_sanitized)
4644			return -ENOMEM;
4645		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4646		ulen = min_t(u32, info.xlated_prog_len, ulen);
4647		fault = copy_to_user(uinsns, insns_sanitized, ulen);
4648		kfree(insns_sanitized);
4649		if (fault)
4650			return -EFAULT;
4651	}
4652
4653	if (bpf_prog_is_offloaded(prog->aux)) {
4654		err = bpf_prog_offload_info_fill(&info, prog);
4655		if (err)
4656			return err;
4657		goto done;
4658	}
4659
4660	/* NOTE: the following code is supposed to be skipped for offload.
4661	 * bpf_prog_offload_info_fill() is the place to fill similar fields
4662	 * for offload.
4663	 */
4664	ulen = info.jited_prog_len;
4665	if (prog->aux->func_cnt) {
4666		u32 i;
4667
4668		info.jited_prog_len = 0;
4669		for (i = 0; i < prog->aux->func_cnt; i++)
4670			info.jited_prog_len += prog->aux->func[i]->jited_len;
4671	} else {
4672		info.jited_prog_len = prog->jited_len;
4673	}
4674
4675	if (info.jited_prog_len && ulen) {
4676		if (bpf_dump_raw_ok(file->f_cred)) {
4677			uinsns = u64_to_user_ptr(info.jited_prog_insns);
4678			ulen = min_t(u32, info.jited_prog_len, ulen);
4679
4680			/* for multi-function programs, copy the JITed
4681			 * instructions for all the functions
4682			 */
4683			if (prog->aux->func_cnt) {
4684				u32 len, free, i;
4685				u8 *img;
4686
4687				free = ulen;
4688				for (i = 0; i < prog->aux->func_cnt; i++) {
4689					len = prog->aux->func[i]->jited_len;
4690					len = min_t(u32, len, free);
4691					img = (u8 *) prog->aux->func[i]->bpf_func;
4692					if (copy_to_user(uinsns, img, len))
4693						return -EFAULT;
4694					uinsns += len;
4695					free -= len;
4696					if (!free)
4697						break;
4698				}
4699			} else {
4700				if (copy_to_user(uinsns, prog->bpf_func, ulen))
4701					return -EFAULT;
4702			}
4703		} else {
4704			info.jited_prog_insns = 0;
4705		}
4706	}
4707
4708	ulen = info.nr_jited_ksyms;
4709	info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4710	if (ulen) {
4711		if (bpf_dump_raw_ok(file->f_cred)) {
4712			unsigned long ksym_addr;
4713			u64 __user *user_ksyms;
4714			u32 i;
4715
4716			/* copy the address of the kernel symbol
4717			 * corresponding to each function
4718			 */
4719			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4720			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4721			if (prog->aux->func_cnt) {
4722				for (i = 0; i < ulen; i++) {
4723					ksym_addr = (unsigned long)
4724						prog->aux->func[i]->bpf_func;
4725					if (put_user((u64) ksym_addr,
4726						     &user_ksyms[i]))
4727						return -EFAULT;
4728				}
4729			} else {
4730				ksym_addr = (unsigned long) prog->bpf_func;
4731				if (put_user((u64) ksym_addr, &user_ksyms[0]))
4732					return -EFAULT;
4733			}
4734		} else {
4735			info.jited_ksyms = 0;
4736		}
4737	}
4738
4739	ulen = info.nr_jited_func_lens;
4740	info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4741	if (ulen) {
4742		if (bpf_dump_raw_ok(file->f_cred)) {
4743			u32 __user *user_lens;
4744			u32 func_len, i;
4745
4746			/* copy the JITed image lengths for each function */
4747			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4748			user_lens = u64_to_user_ptr(info.jited_func_lens);
4749			if (prog->aux->func_cnt) {
4750				for (i = 0; i < ulen; i++) {
4751					func_len =
4752						prog->aux->func[i]->jited_len;
4753					if (put_user(func_len, &user_lens[i]))
4754						return -EFAULT;
4755				}
4756			} else {
4757				func_len = prog->jited_len;
4758				if (put_user(func_len, &user_lens[0]))
4759					return -EFAULT;
4760			}
4761		} else {
4762			info.jited_func_lens = 0;
4763		}
4764	}
4765
4766	if (prog->aux->btf)
4767		info.btf_id = btf_obj_id(prog->aux->btf);
4768	info.attach_btf_id = prog->aux->attach_btf_id;
4769	if (attach_btf)
4770		info.attach_btf_obj_id = btf_obj_id(attach_btf);
4771
4772	ulen = info.nr_func_info;
4773	info.nr_func_info = prog->aux->func_info_cnt;
4774	if (info.nr_func_info && ulen) {
4775		char __user *user_finfo;
4776
4777		user_finfo = u64_to_user_ptr(info.func_info);
4778		ulen = min_t(u32, info.nr_func_info, ulen);
4779		if (copy_to_user(user_finfo, prog->aux->func_info,
4780				 info.func_info_rec_size * ulen))
4781			return -EFAULT;
4782	}
4783
4784	ulen = info.nr_line_info;
4785	info.nr_line_info = prog->aux->nr_linfo;
4786	if (info.nr_line_info && ulen) {
4787		__u8 __user *user_linfo;
4788
4789		user_linfo = u64_to_user_ptr(info.line_info);
4790		ulen = min_t(u32, info.nr_line_info, ulen);
4791		if (copy_to_user(user_linfo, prog->aux->linfo,
4792				 info.line_info_rec_size * ulen))
4793			return -EFAULT;
4794	}
4795
4796	ulen = info.nr_jited_line_info;
4797	if (prog->aux->jited_linfo)
4798		info.nr_jited_line_info = prog->aux->nr_linfo;
4799	else
4800		info.nr_jited_line_info = 0;
4801	if (info.nr_jited_line_info && ulen) {
4802		if (bpf_dump_raw_ok(file->f_cred)) {
4803			unsigned long line_addr;
4804			__u64 __user *user_linfo;
4805			u32 i;
4806
4807			user_linfo = u64_to_user_ptr(info.jited_line_info);
4808			ulen = min_t(u32, info.nr_jited_line_info, ulen);
4809			for (i = 0; i < ulen; i++) {
4810				line_addr = (unsigned long)prog->aux->jited_linfo[i];
4811				if (put_user((__u64)line_addr, &user_linfo[i]))
4812					return -EFAULT;
4813			}
4814		} else {
4815			info.jited_line_info = 0;
4816		}
4817	}
4818
4819	ulen = info.nr_prog_tags;
4820	info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4821	if (ulen) {
4822		__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4823		u32 i;
4824
4825		user_prog_tags = u64_to_user_ptr(info.prog_tags);
4826		ulen = min_t(u32, info.nr_prog_tags, ulen);
4827		if (prog->aux->func_cnt) {
4828			for (i = 0; i < ulen; i++) {
4829				if (copy_to_user(user_prog_tags[i],
4830						 prog->aux->func[i]->tag,
4831						 BPF_TAG_SIZE))
4832					return -EFAULT;
4833			}
4834		} else {
4835			if (copy_to_user(user_prog_tags[0],
4836					 prog->tag, BPF_TAG_SIZE))
4837				return -EFAULT;
4838		}
4839	}
4840
4841done:
4842	if (copy_to_user(uinfo, &info, info_len) ||
4843	    put_user(info_len, &uattr->info.info_len))
4844		return -EFAULT;
4845
4846	return 0;
4847}
4848
4849static int bpf_map_get_info_by_fd(struct file *file,
4850				  struct bpf_map *map,
4851				  const union bpf_attr *attr,
4852				  union bpf_attr __user *uattr)
4853{
4854	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4855	struct bpf_map_info info;
4856	u32 info_len = attr->info.info_len;
4857	int err;
4858
4859	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4860	if (err)
4861		return err;
4862	info_len = min_t(u32, sizeof(info), info_len);
4863
4864	memset(&info, 0, sizeof(info));
4865	info.type = map->map_type;
4866	info.id = map->id;
4867	info.key_size = map->key_size;
4868	info.value_size = map->value_size;
4869	info.max_entries = map->max_entries;
4870	info.map_flags = map->map_flags;
4871	info.map_extra = map->map_extra;
4872	memcpy(info.name, map->name, sizeof(map->name));
4873
4874	if (map->btf) {
4875		info.btf_id = btf_obj_id(map->btf);
4876		info.btf_key_type_id = map->btf_key_type_id;
4877		info.btf_value_type_id = map->btf_value_type_id;
4878	}
4879	info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4880	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
4881		bpf_map_struct_ops_info_fill(&info, map);
4882
4883	if (bpf_map_is_offloaded(map)) {
4884		err = bpf_map_offload_info_fill(&info, map);
4885		if (err)
4886			return err;
4887	}
4888
4889	if (copy_to_user(uinfo, &info, info_len) ||
4890	    put_user(info_len, &uattr->info.info_len))
4891		return -EFAULT;
4892
4893	return 0;
4894}
4895
4896static int bpf_btf_get_info_by_fd(struct file *file,
4897				  struct btf *btf,
4898				  const union bpf_attr *attr,
4899				  union bpf_attr __user *uattr)
4900{
4901	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4902	u32 info_len = attr->info.info_len;
4903	int err;
4904
4905	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4906	if (err)
4907		return err;
4908
4909	return btf_get_info_by_fd(btf, attr, uattr);
4910}
4911
4912static int bpf_link_get_info_by_fd(struct file *file,
4913				  struct bpf_link *link,
4914				  const union bpf_attr *attr,
4915				  union bpf_attr __user *uattr)
4916{
4917	struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4918	struct bpf_link_info info;
4919	u32 info_len = attr->info.info_len;
4920	int err;
4921
4922	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4923	if (err)
4924		return err;
4925	info_len = min_t(u32, sizeof(info), info_len);
4926
4927	memset(&info, 0, sizeof(info));
4928	if (copy_from_user(&info, uinfo, info_len))
4929		return -EFAULT;
4930
4931	info.type = link->type;
4932	info.id = link->id;
4933	if (link->prog)
4934		info.prog_id = link->prog->aux->id;
4935
4936	if (link->ops->fill_link_info) {
4937		err = link->ops->fill_link_info(link, &info);
4938		if (err)
4939			return err;
4940	}
4941
4942	if (copy_to_user(uinfo, &info, info_len) ||
4943	    put_user(info_len, &uattr->info.info_len))
4944		return -EFAULT;
4945
4946	return 0;
4947}
4948
4950#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4951
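/* Dispatch BPF_OBJ_GET_INFO_BY_FD based on what kind of bpf object the fd
 * refers to.  Illustrative userspace usage (not kernel code); trailing
 * bytes of the info buffer that the kernel doesn't know about must be
 * zero, see bpf_check_uarg_tail_zero():
 *
 *	struct bpf_prog_info info = {};
 *	union bpf_attr attr = {};
 *
 *	attr.info.bpf_fd   = prog_fd;
 *	attr.info.info_len = sizeof(info);
 *	attr.info.info     = (__u64)(unsigned long)&info;
 *	if (!syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
 *		printf("id %u type %u\n", info.id, info.type);
 */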
4952static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4953				  union bpf_attr __user *uattr)
4954{
4955	int ufd = attr->info.bpf_fd;
4956	struct fd f;
4957	int err;
4958
4959	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4960		return -EINVAL;
4961
4962	f = fdget(ufd);
4963	if (!f.file)
4964		return -EBADFD;
4965
4966	if (f.file->f_op == &bpf_prog_fops)
4967		err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4968					      uattr);
4969	else if (f.file->f_op == &bpf_map_fops)
4970		err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4971					     uattr);
4972	else if (f.file->f_op == &btf_fops)
4973		err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4974	else if (f.file->f_op == &bpf_link_fops)
4975		err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4976					      attr, uattr);
4977	else
4978		err = -EINVAL;
4979
4980	fdput(f);
4981	return err;
4982}
4983
4984#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
4985
4986static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4987{
4988	struct bpf_token *token = NULL;
4989
4990	if (CHECK_ATTR(BPF_BTF_LOAD))
4991		return -EINVAL;
4992
4993	if (attr->btf_flags & ~BPF_F_TOKEN_FD)
4994		return -EINVAL;
4995
4996	if (attr->btf_flags & BPF_F_TOKEN_FD) {
4997		token = bpf_token_get_from_fd(attr->btf_token_fd);
4998		if (IS_ERR(token))
4999			return PTR_ERR(token);
5000		if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
5001			bpf_token_put(token);
5002			token = NULL;
5003		}
5004	}
5005
5006	if (!bpf_token_capable(token, CAP_BPF)) {
5007		bpf_token_put(token);
5008		return -EPERM;
5009	}
5010
5011	bpf_token_put(token);
5012
5013	return btf_new_fd(attr, uattr, uattr_size);
5014}
5015
5016#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
5017
5018static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
5019{
5020	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
5021		return -EINVAL;
5022
5023	if (!capable(CAP_SYS_ADMIN))
5024		return -EPERM;
5025
5026	return btf_get_fd_by_id(attr->btf_id);
5027}
5028
5029static int bpf_task_fd_query_copy(const union bpf_attr *attr,
5030				    union bpf_attr __user *uattr,
5031				    u32 prog_id, u32 fd_type,
5032				    const char *buf, u64 probe_offset,
5033				    u64 probe_addr)
5034{
5035	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
5036	u32 len = buf ? strlen(buf) : 0, input_len;
5037	int err = 0;
5038
5039	if (put_user(len, &uattr->task_fd_query.buf_len))
5040		return -EFAULT;
5041	input_len = attr->task_fd_query.buf_len;
5042	if (input_len && ubuf) {
5043		if (!len) {
5044			/* nothing to copy, just make ubuf NULL terminated */
5045			char zero = '\0';
5046
5047			if (put_user(zero, ubuf))
5048				return -EFAULT;
5049		} else if (input_len >= len + 1) {
5050			/* ubuf can hold the string with NULL terminator */
5051			if (copy_to_user(ubuf, buf, len + 1))
5052				return -EFAULT;
5053		} else {
5054			/* ubuf cannot hold the string with NULL terminator,
5055			 * do a partial copy with NULL terminator.
5056			 */
5057			char zero = '\0';
5058
5059			err = -ENOSPC;
5060			if (copy_to_user(ubuf, buf, input_len - 1))
5061				return -EFAULT;
5062			if (put_user(zero, ubuf + input_len - 1))
5063				return -EFAULT;
5064		}
5065	}
5066
5067	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
5068	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
5069	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
5070	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
5071		return -EFAULT;
5072
5073	return err;
5074}
5075
5076#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
5077
5078static int bpf_task_fd_query(const union bpf_attr *attr,
5079			     union bpf_attr __user *uattr)
5080{
5081	pid_t pid = attr->task_fd_query.pid;
5082	u32 fd = attr->task_fd_query.fd;
5083	const struct perf_event *event;
5084	struct task_struct *task;
5085	struct file *file;
5086	int err;
5087
5088	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
5089		return -EINVAL;
5090
5091	if (!capable(CAP_SYS_ADMIN))
5092		return -EPERM;
5093
5094	if (attr->task_fd_query.flags != 0)
5095		return -EINVAL;
5096
5097	rcu_read_lock();
5098	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
5099	rcu_read_unlock();
5100	if (!task)
5101		return -ENOENT;
5102
5103	err = 0;
5104	file = fget_task(task, fd);
5105	put_task_struct(task);
5106	if (!file)
5107		return -EBADF;
5108
5109	if (file->f_op == &bpf_link_fops) {
5110		struct bpf_link *link = file->private_data;
5111
5112		if (link->ops == &bpf_raw_tp_link_lops) {
5113			struct bpf_raw_tp_link *raw_tp =
5114				container_of(link, struct bpf_raw_tp_link, link);
5115			struct bpf_raw_event_map *btp = raw_tp->btp;
5116
5117			err = bpf_task_fd_query_copy(attr, uattr,
5118						     raw_tp->link.prog->aux->id,
5119						     BPF_FD_TYPE_RAW_TRACEPOINT,
5120						     btp->tp->name, 0, 0);
5121			goto put_file;
5122		}
5123		goto out_not_supp;
5124	}
5125
5126	event = perf_get_event(file);
5127	if (!IS_ERR(event)) {
5128		u64 probe_offset, probe_addr;
5129		u32 prog_id, fd_type;
5130		const char *buf;
5131
5132		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
5133					      &buf, &probe_offset,
5134					      &probe_addr, NULL);
5135		if (!err)
5136			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
5137						     fd_type, buf,
5138						     probe_offset,
5139						     probe_addr);
5140		goto put_file;
5141	}
5142
5143out_not_supp:
5144	err = -ENOTSUPP;
5145put_file:
5146	fput(file);
5147	return err;
5148}
5149
5150#define BPF_MAP_BATCH_LAST_FIELD batch.flags
5151
5152#define BPF_DO_BATCH(fn, ...)			\
5153	do {					\
5154		if (!fn) {			\
5155			err = -ENOTSUPP;	\
5156			goto err_put;		\
5157		}				\
5158		err = fn(__VA_ARGS__);		\
5159	} while (0)
5160
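/* Common entry point for the four BPF_MAP_*_BATCH commands.  Lookup-style
 * commands need FMODE_CAN_READ on the map fd, anything that updates or
 * deletes needs FMODE_CAN_WRITE and is bracketed by
 * bpf_map_write_active_inc()/_dec(), with a maybe_wait_bpf_programs() once
 * the batch is done.
 */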
5161static int bpf_map_do_batch(const union bpf_attr *attr,
5162			    union bpf_attr __user *uattr,
5163			    int cmd)
5164{
5165	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
5166			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
5167	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
5168	struct bpf_map *map;
5169	int err, ufd;
5170	struct fd f;
5171
5172	if (CHECK_ATTR(BPF_MAP_BATCH))
5173		return -EINVAL;
5174
5175	ufd = attr->batch.map_fd;
5176	f = fdget(ufd);
5177	map = __bpf_map_get(f);
5178	if (IS_ERR(map))
5179		return PTR_ERR(map);
5180	if (has_write)
5181		bpf_map_write_active_inc(map);
5182	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
5183		err = -EPERM;
5184		goto err_put;
5185	}
5186	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
5187		err = -EPERM;
5188		goto err_put;
5189	}
5190
5191	if (cmd == BPF_MAP_LOOKUP_BATCH)
5192		BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
5193	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
5194		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
5195	else if (cmd == BPF_MAP_UPDATE_BATCH)
5196		BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
5197	else
5198		BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
5199err_put:
5200	if (has_write) {
5201		maybe_wait_bpf_programs(map);
5202		bpf_map_write_active_dec(map);
5203	}
5204	fdput(f);
5205	return err;
5206}
5207
5208#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
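/* BPF_LINK_CREATE: wrap @attr->link_create.prog_fd in a struct bpf_link
 * attached to the requested hook and hand back an fd owning it.  The
 * attach type is validated against the program first; dispatch is then by
 * program type.  BPF_STRUCT_OPS links take no program fd and are handled
 * up front.
 */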
5209static int link_create(union bpf_attr *attr, bpfptr_t uattr)
5210{
5211	struct bpf_prog *prog;
5212	int ret;
5213
5214	if (CHECK_ATTR(BPF_LINK_CREATE))
5215		return -EINVAL;
5216
5217	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
5218		return bpf_struct_ops_link_create(attr);
5219
5220	prog = bpf_prog_get(attr->link_create.prog_fd);
5221	if (IS_ERR(prog))
5222		return PTR_ERR(prog);
5223
5224	ret = bpf_prog_attach_check_attach_type(prog,
5225						attr->link_create.attach_type);
5226	if (ret)
5227		goto out;
5228
5229	switch (prog->type) {
5230	case BPF_PROG_TYPE_CGROUP_SKB:
5231	case BPF_PROG_TYPE_CGROUP_SOCK:
5232	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5233	case BPF_PROG_TYPE_SOCK_OPS:
5234	case BPF_PROG_TYPE_CGROUP_DEVICE:
5235	case BPF_PROG_TYPE_CGROUP_SYSCTL:
5236	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5237		ret = cgroup_bpf_link_attach(attr, prog);
5238		break;
5239	case BPF_PROG_TYPE_EXT:
5240		ret = bpf_tracing_prog_attach(prog,
5241					      attr->link_create.target_fd,
5242					      attr->link_create.target_btf_id,
5243					      attr->link_create.tracing.cookie);
5244		break;
5245	case BPF_PROG_TYPE_LSM:
5246	case BPF_PROG_TYPE_TRACING:
5247		if (attr->link_create.attach_type != prog->expected_attach_type) {
5248			ret = -EINVAL;
5249			goto out;
5250		}
5251		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
5252			ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
5253		else if (prog->expected_attach_type == BPF_TRACE_ITER)
5254			ret = bpf_iter_link_attach(attr, uattr, prog);
5255		else if (prog->expected_attach_type == BPF_LSM_CGROUP)
5256			ret = cgroup_bpf_link_attach(attr, prog);
5257		else
5258			ret = bpf_tracing_prog_attach(prog,
5259						      attr->link_create.target_fd,
5260						      attr->link_create.target_btf_id,
5261						      attr->link_create.tracing.cookie);
5262		break;
5263	case BPF_PROG_TYPE_FLOW_DISSECTOR:
5264	case BPF_PROG_TYPE_SK_LOOKUP:
5265		ret = netns_bpf_link_create(attr, prog);
5266		break;
5267	case BPF_PROG_TYPE_SK_MSG:
5268	case BPF_PROG_TYPE_SK_SKB:
5269		ret = sock_map_link_create(attr, prog);
5270		break;
5271#ifdef CONFIG_NET
5272	case BPF_PROG_TYPE_XDP:
5273		ret = bpf_xdp_link_attach(attr, prog);
5274		break;
5275	case BPF_PROG_TYPE_SCHED_CLS:
5276		if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
5277		    attr->link_create.attach_type == BPF_TCX_EGRESS)
5278			ret = tcx_link_attach(attr, prog);
5279		else
5280			ret = netkit_link_attach(attr, prog);
5281		break;
5282	case BPF_PROG_TYPE_NETFILTER:
5283		ret = bpf_nf_link_attach(attr, prog);
5284		break;
5285#endif
5286	case BPF_PROG_TYPE_PERF_EVENT:
5287	case BPF_PROG_TYPE_TRACEPOINT:
5288		ret = bpf_perf_link_attach(attr, prog);
5289		break;
5290	case BPF_PROG_TYPE_KPROBE:
5291		if (attr->link_create.attach_type == BPF_PERF_EVENT)
5292			ret = bpf_perf_link_attach(attr, prog);
5293		else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
5294			 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
5295			ret = bpf_kprobe_multi_link_attach(attr, prog);
5296		else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
5297			ret = bpf_uprobe_multi_link_attach(attr, prog);
5298		break;
5299	default:
5300		ret = -EINVAL;
5301	}
5302
5303out:
5304	if (ret < 0)
5305		bpf_prog_put(prog);
5306	return ret;
5307}
5308
5309static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
5310{
5311	struct bpf_map *new_map, *old_map = NULL;
5312	int ret;
5313
5314	new_map = bpf_map_get(attr->link_update.new_map_fd);
5315	if (IS_ERR(new_map))
5316		return PTR_ERR(new_map);
5317
5318	if (attr->link_update.flags & BPF_F_REPLACE) {
5319		old_map = bpf_map_get(attr->link_update.old_map_fd);
5320		if (IS_ERR(old_map)) {
5321			ret = PTR_ERR(old_map);
5322			goto out_put;
5323		}
5324	} else if (attr->link_update.old_map_fd) {
5325		ret = -EINVAL;
5326		goto out_put;
5327	}
5328
5329	ret = link->ops->update_map(link, new_map, old_map);
5330
5331	if (old_map)
5332		bpf_map_put(old_map);
5333out_put:
5334	bpf_map_put(new_map);
5335	return ret;
5336}
5337
5338#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5339
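/* BPF_LINK_UPDATE: swap the program (or, for links that are backed by a
 * map, the map) behind an existing link.  With BPF_F_REPLACE the caller
 * must also supply the expected old_prog_fd/old_map_fd, which is passed
 * down to the link's ->update_prog()/->update_map() callback; without the
 * flag those fields must be zero.
 */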
5340static int link_update(union bpf_attr *attr)
5341{
5342	struct bpf_prog *old_prog = NULL, *new_prog;
5343	struct bpf_link *link;
5344	u32 flags;
5345	int ret;
5346
5347	if (CHECK_ATTR(BPF_LINK_UPDATE))
5348		return -EINVAL;
5349
5350	flags = attr->link_update.flags;
5351	if (flags & ~BPF_F_REPLACE)
5352		return -EINVAL;
5353
5354	link = bpf_link_get_from_fd(attr->link_update.link_fd);
5355	if (IS_ERR(link))
5356		return PTR_ERR(link);
5357
5358	if (link->ops->update_map) {
5359		ret = link_update_map(link, attr);
5360		goto out_put_link;
5361	}
5362
5363	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
5364	if (IS_ERR(new_prog)) {
5365		ret = PTR_ERR(new_prog);
5366		goto out_put_link;
5367	}
5368
5369	if (flags & BPF_F_REPLACE) {
5370		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
5371		if (IS_ERR(old_prog)) {
5372			ret = PTR_ERR(old_prog);
5373			old_prog = NULL;
5374			goto out_put_progs;
5375		}
5376	} else if (attr->link_update.old_prog_fd) {
5377		ret = -EINVAL;
5378		goto out_put_progs;
5379	}
5380
5381	if (link->ops->update_prog)
5382		ret = link->ops->update_prog(link, new_prog, old_prog);
5383	else
5384		ret = -EINVAL;
5385
5386out_put_progs:
5387	if (old_prog)
5388		bpf_prog_put(old_prog);
5389	if (ret)
5390		bpf_prog_put(new_prog);
5391out_put_link:
5392	bpf_link_put_direct(link);
5393	return ret;
5394}
5395
5396#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5397
5398static int link_detach(union bpf_attr *attr)
5399{
5400	struct bpf_link *link;
5401	int ret;
5402
5403	if (CHECK_ATTR(BPF_LINK_DETACH))
5404		return -EINVAL;
5405
5406	link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5407	if (IS_ERR(link))
5408		return PTR_ERR(link);
5409
5410	if (link->ops->detach)
5411		ret = link->ops->detach(link);
5412	else
5413		ret = -EOPNOTSUPP;
5414
5415	bpf_link_put_direct(link);
5416	return ret;
5417}
5418
5419static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
5420{
5421	return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
5422}
5423
5424struct bpf_link *bpf_link_by_id(u32 id)
5425{
5426	struct bpf_link *link;
5427
5428	if (!id)
5429		return ERR_PTR(-ENOENT);
5430
5431	spin_lock_bh(&link_idr_lock);
5432	/* before link is "settled", ID is 0, pretend it doesn't exist yet */
5433	link = idr_find(&link_idr, id);
5434	if (link) {
5435		if (link->id)
5436			link = bpf_link_inc_not_zero(link);
5437		else
5438			link = ERR_PTR(-EAGAIN);
5439	} else {
5440		link = ERR_PTR(-ENOENT);
5441	}
5442	spin_unlock_bh(&link_idr_lock);
5443	return link;
5444}
5445
5446struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
5447{
5448	struct bpf_link *link;
5449
5450	spin_lock_bh(&link_idr_lock);
5451again:
5452	link = idr_get_next(&link_idr, id);
5453	if (link) {
5454		link = bpf_link_inc_not_zero(link);
5455		if (IS_ERR(link)) {
5456			(*id)++;
5457			goto again;
5458		}
5459	}
5460	spin_unlock_bh(&link_idr_lock);
5461
5462	return link;
5463}
5464
5465#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5466
5467static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5468{
5469	struct bpf_link *link;
5470	u32 id = attr->link_id;
5471	int fd;
5472
5473	if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5474		return -EINVAL;
5475
5476	if (!capable(CAP_SYS_ADMIN))
5477		return -EPERM;
5478
5479	link = bpf_link_by_id(id);
5480	if (IS_ERR(link))
5481		return PTR_ERR(link);
5482
5483	fd = bpf_link_new_fd(link);
5484	if (fd < 0)
5485		bpf_link_put_direct(link);
5486
5487	return fd;
5488}
5489
5490DEFINE_MUTEX(bpf_stats_enabled_mutex);
5491
5492static int bpf_stats_release(struct inode *inode, struct file *file)
5493{
5494	mutex_lock(&bpf_stats_enabled_mutex);
5495	static_key_slow_dec(&bpf_stats_enabled_key.key);
5496	mutex_unlock(&bpf_stats_enabled_mutex);
5497	return 0;
5498}
5499
5500static const struct file_operations bpf_stats_fops = {
5501	.release = bpf_stats_release,
5502};
5503
5504static int bpf_enable_runtime_stats(void)
5505{
5506	int fd;
5507
5508	mutex_lock(&bpf_stats_enabled_mutex);
5509
5510	/* Set a very high limit to avoid overflow */
5511	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
5512		mutex_unlock(&bpf_stats_enabled_mutex);
5513		return -EBUSY;
5514	}
5515
5516	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
5517	if (fd >= 0)
5518		static_key_slow_inc(&bpf_stats_enabled_key.key);
5519
5520	mutex_unlock(&bpf_stats_enabled_mutex);
5521	return fd;
5522}
5523
5524#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5525
5526static int bpf_enable_stats(union bpf_attr *attr)
5527{
5529	if (CHECK_ATTR(BPF_ENABLE_STATS))
5530		return -EINVAL;
5531
5532	if (!capable(CAP_SYS_ADMIN))
5533		return -EPERM;
5534
5535	switch (attr->enable_stats.type) {
5536	case BPF_STATS_RUN_TIME:
5537		return bpf_enable_runtime_stats();
5538	default:
5539		break;
5540	}
5541	return -EINVAL;
5542}
5543
5544#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5545
5546static int bpf_iter_create(union bpf_attr *attr)
5547{
5548	struct bpf_link *link;
5549	int err;
5550
5551	if (CHECK_ATTR(BPF_ITER_CREATE))
5552		return -EINVAL;
5553
5554	if (attr->iter_create.flags)
5555		return -EINVAL;
5556
5557	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5558	if (IS_ERR(link))
5559		return PTR_ERR(link);
5560
5561	err = bpf_iter_new_fd(link);
5562	bpf_link_put_direct(link);
5563
5564	return err;
5565}
5566
5567#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5568
5569static int bpf_prog_bind_map(union bpf_attr *attr)
5570{
5571	struct bpf_prog *prog;
5572	struct bpf_map *map;
5573	struct bpf_map **used_maps_old, **used_maps_new;
5574	int i, ret = 0;
5575
5576	if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5577		return -EINVAL;
5578
5579	if (attr->prog_bind_map.flags)
5580		return -EINVAL;
5581
5582	prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
5583	if (IS_ERR(prog))
5584		return PTR_ERR(prog);
5585
5586	map = bpf_map_get(attr->prog_bind_map.map_fd);
5587	if (IS_ERR(map)) {
5588		ret = PTR_ERR(map);
5589		goto out_prog_put;
5590	}
5591
5592	mutex_lock(&prog->aux->used_maps_mutex);
5593
5594	used_maps_old = prog->aux->used_maps;
5595
5596	for (i = 0; i < prog->aux->used_map_cnt; i++)
5597		if (used_maps_old[i] == map) {
5598			bpf_map_put(map);
5599			goto out_unlock;
5600		}
5601
5602	used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
5603				      sizeof(used_maps_new[0]),
5604				      GFP_KERNEL);
5605	if (!used_maps_new) {
5606		ret = -ENOMEM;
5607		goto out_unlock;
5608	}
5609
5610	/* The bpf program will not access the bpf map, but for the sake of
5611	 * simplicity, increase sleepable_refcnt for sleepable programs as well.
5612	 */
5613	if (prog->sleepable)
5614		atomic64_inc(&map->sleepable_refcnt);
5615	memcpy(used_maps_new, used_maps_old,
5616	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
5617	used_maps_new[prog->aux->used_map_cnt] = map;
5618
5619	prog->aux->used_map_cnt++;
5620	prog->aux->used_maps = used_maps_new;
5621
5622	kfree(used_maps_old);
5623
5624out_unlock:
5625	mutex_unlock(&prog->aux->used_maps_mutex);
5626
5627	if (ret)
5628		bpf_map_put(map);
5629out_prog_put:
5630	bpf_prog_put(prog);
5631	return ret;
5632}
5633
5634#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
5635
5636static int token_create(union bpf_attr *attr)
5637{
5638	if (CHECK_ATTR(BPF_TOKEN_CREATE))
5639		return -EINVAL;
5640
5641	/* no flags are supported yet */
5642	if (attr->token_create.flags)
5643		return -EINVAL;
5644
5645	return bpf_token_create(attr);
5646}
5647
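/* Single dispatcher behind the bpf(2) syscall (and behind bpf_sys_bpf()
 * for in-kernel callers, see below).  Copies the command attributes while
 * rejecting unknown trailing non-zero bytes, runs the security_bpf() LSM
 * hook, then switches on @cmd.
 */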
5648static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5649{
5650	union bpf_attr attr;
5651	int err;
5652
5653	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
5654	if (err)
5655		return err;
5656	size = min_t(u32, size, sizeof(attr));
5657
5658	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
5659	memset(&attr, 0, sizeof(attr));
5660	if (copy_from_bpfptr(&attr, uattr, size) != 0)
5661		return -EFAULT;
5662
5663	err = security_bpf(cmd, &attr, size);
5664	if (err < 0)
5665		return err;
5666
5667	switch (cmd) {
5668	case BPF_MAP_CREATE:
5669		err = map_create(&attr);
5670		break;
5671	case BPF_MAP_LOOKUP_ELEM:
5672		err = map_lookup_elem(&attr);
5673		break;
5674	case BPF_MAP_UPDATE_ELEM:
5675		err = map_update_elem(&attr, uattr);
5676		break;
5677	case BPF_MAP_DELETE_ELEM:
5678		err = map_delete_elem(&attr, uattr);
5679		break;
5680	case BPF_MAP_GET_NEXT_KEY:
5681		err = map_get_next_key(&attr);
5682		break;
5683	case BPF_MAP_FREEZE:
5684		err = map_freeze(&attr);
5685		break;
5686	case BPF_PROG_LOAD:
5687		err = bpf_prog_load(&attr, uattr, size);
5688		break;
5689	case BPF_OBJ_PIN:
5690		err = bpf_obj_pin(&attr);
5691		break;
5692	case BPF_OBJ_GET:
5693		err = bpf_obj_get(&attr);
5694		break;
5695	case BPF_PROG_ATTACH:
5696		err = bpf_prog_attach(&attr);
5697		break;
5698	case BPF_PROG_DETACH:
5699		err = bpf_prog_detach(&attr);
5700		break;
5701	case BPF_PROG_QUERY:
5702		err = bpf_prog_query(&attr, uattr.user);
5703		break;
5704	case BPF_PROG_TEST_RUN:
5705		err = bpf_prog_test_run(&attr, uattr.user);
5706		break;
5707	case BPF_PROG_GET_NEXT_ID:
5708		err = bpf_obj_get_next_id(&attr, uattr.user,
5709					  &prog_idr, &prog_idr_lock);
5710		break;
5711	case BPF_MAP_GET_NEXT_ID:
5712		err = bpf_obj_get_next_id(&attr, uattr.user,
5713					  &map_idr, &map_idr_lock);
5714		break;
5715	case BPF_BTF_GET_NEXT_ID:
5716		err = bpf_obj_get_next_id(&attr, uattr.user,
5717					  &btf_idr, &btf_idr_lock);
5718		break;
5719	case BPF_PROG_GET_FD_BY_ID:
5720		err = bpf_prog_get_fd_by_id(&attr);
5721		break;
5722	case BPF_MAP_GET_FD_BY_ID:
5723		err = bpf_map_get_fd_by_id(&attr);
5724		break;
5725	case BPF_OBJ_GET_INFO_BY_FD:
5726		err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5727		break;
5728	case BPF_RAW_TRACEPOINT_OPEN:
5729		err = bpf_raw_tracepoint_open(&attr);
5730		break;
5731	case BPF_BTF_LOAD:
5732		err = bpf_btf_load(&attr, uattr, size);
5733		break;
5734	case BPF_BTF_GET_FD_BY_ID:
5735		err = bpf_btf_get_fd_by_id(&attr);
5736		break;
5737	case BPF_TASK_FD_QUERY:
5738		err = bpf_task_fd_query(&attr, uattr.user);
5739		break;
5740	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5741		err = map_lookup_and_delete_elem(&attr);
5742		break;
5743	case BPF_MAP_LOOKUP_BATCH:
5744		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5745		break;
5746	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5747		err = bpf_map_do_batch(&attr, uattr.user,
5748				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5749		break;
5750	case BPF_MAP_UPDATE_BATCH:
5751		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5752		break;
5753	case BPF_MAP_DELETE_BATCH:
5754		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5755		break;
5756	case BPF_LINK_CREATE:
5757		err = link_create(&attr, uattr);
5758		break;
5759	case BPF_LINK_UPDATE:
5760		err = link_update(&attr);
5761		break;
5762	case BPF_LINK_GET_FD_BY_ID:
5763		err = bpf_link_get_fd_by_id(&attr);
5764		break;
5765	case BPF_LINK_GET_NEXT_ID:
5766		err = bpf_obj_get_next_id(&attr, uattr.user,
5767					  &link_idr, &link_idr_lock);
5768		break;
5769	case BPF_ENABLE_STATS:
5770		err = bpf_enable_stats(&attr);
5771		break;
5772	case BPF_ITER_CREATE:
5773		err = bpf_iter_create(&attr);
5774		break;
5775	case BPF_LINK_DETACH:
5776		err = link_detach(&attr);
5777		break;
5778	case BPF_PROG_BIND_MAP:
5779		err = bpf_prog_bind_map(&attr);
5780		break;
5781	case BPF_TOKEN_CREATE:
5782		err = token_create(&attr);
5783		break;
5784	default:
5785		err = -EINVAL;
5786		break;
5787	}
5788
5789	return err;
5790}
5791
5792SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5793{
5794	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5795}
5796
5797static bool syscall_prog_is_valid_access(int off, int size,
5798					 enum bpf_access_type type,
5799					 const struct bpf_prog *prog,
5800					 struct bpf_insn_access_aux *info)
5801{
5802	if (off < 0 || off >= U16_MAX)
5803		return false;
5804	if (off % size != 0)
5805		return false;
5806	return true;
5807}
5808
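/* Implementation of the bpf_sys_bpf() helper: lets BPF syscall programs
 * issue a restricted subset of bpf(2) commands (map create/update/delete/
 * freeze, map get-fd-by-id, prog load, BTF load, link create, raw
 * tracepoint open) with the attributes living in kernel memory.
 */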
5809BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5810{
5811	switch (cmd) {
5812	case BPF_MAP_CREATE:
5813	case BPF_MAP_DELETE_ELEM:
5814	case BPF_MAP_UPDATE_ELEM:
5815	case BPF_MAP_FREEZE:
5816	case BPF_MAP_GET_FD_BY_ID:
5817	case BPF_PROG_LOAD:
5818	case BPF_BTF_LOAD:
5819	case BPF_LINK_CREATE:
5820	case BPF_RAW_TRACEPOINT_OPEN:
5821		break;
5822	default:
5823		return -EINVAL;
5824	}
5825	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5826}
5827
5829/* To shut up -Wmissing-prototypes.
5830 * This function is used by the kernel light skeleton
5831 * to load bpf programs when modules are loaded or during kernel boot.
5832 * See tools/lib/bpf/skel_internal.h
5833 */
5834int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5835
5836int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5837{
5838	struct bpf_prog * __maybe_unused prog;
5839	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5840
5841	switch (cmd) {
5842#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5843	case BPF_PROG_TEST_RUN:
5844		if (attr->test.data_in || attr->test.data_out ||
5845		    attr->test.ctx_out || attr->test.duration ||
5846		    attr->test.repeat || attr->test.flags)
5847			return -EINVAL;
5848
5849		prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5850		if (IS_ERR(prog))
5851			return PTR_ERR(prog);
5852
5853		if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5854		    attr->test.ctx_size_in > U16_MAX) {
5855			bpf_prog_put(prog);
5856			return -EINVAL;
5857		}
5858
5859		run_ctx.bpf_cookie = 0;
5860		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5861			/* recursion detected */
5862			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
5863			bpf_prog_put(prog);
5864			return -EBUSY;
5865		}
5866		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5867		__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5868						&run_ctx);
5869		bpf_prog_put(prog);
5870		return 0;
5871#endif
5872	default:
5873		return ____bpf_sys_bpf(cmd, attr, size);
5874	}
5875}
5876EXPORT_SYMBOL(kern_sys_bpf);
5877
5878static const struct bpf_func_proto bpf_sys_bpf_proto = {
5879	.func		= bpf_sys_bpf,
5880	.gpl_only	= false,
5881	.ret_type	= RET_INTEGER,
5882	.arg1_type	= ARG_ANYTHING,
5883	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
5884	.arg3_type	= ARG_CONST_SIZE,
5885};
5886
5887const struct bpf_func_proto * __weak
5888tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5889{
5890	return bpf_base_func_proto(func_id, prog);
5891}
5892
5893BPF_CALL_1(bpf_sys_close, u32, fd)
5894{
5895	/* When a bpf program calls this helper there must not be an fdget()
5896	 * without a matching, completed fdput().
5897	 * This helper is only allowed in the following callchain:
5898	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
5899	 */
5900	return close_fd(fd);
5901}
5902
5903static const struct bpf_func_proto bpf_sys_close_proto = {
5904	.func		= bpf_sys_close,
5905	.gpl_only	= false,
5906	.ret_type	= RET_INTEGER,
5907	.arg1_type	= ARG_ANYTHING,
5908};
5909
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
	if (flags)
		return -EINVAL;

	if (name_sz <= 1 || name[name_sz - 1])
		return -EINVAL;

	if (!bpf_dump_raw_ok(current_cred()))
		return -EPERM;

	*res = kallsyms_lookup_name(name);
	return *res ? 0 : -ENOENT;
}

static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
	.func		= bpf_kallsyms_lookup_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_LONG,
};

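/* Helpers available to BPF_PROG_TYPE_SYSCALL programs.  bpf_sys_bpf()
 * itself is only handed out when the caller (or its BPF token) has
 * CAP_PERFMON; anything not matched here falls back to the tracing
 * helper set.
 */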
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_sys_bpf:
		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
		       ? NULL : &bpf_sys_bpf_proto;
	case BPF_FUNC_btf_find_by_name_kind:
		return &bpf_btf_find_by_name_kind_proto;
	case BPF_FUNC_sys_close:
		return &bpf_sys_close_proto;
	case BPF_FUNC_kallsyms_lookup_name:
		return &bpf_kallsyms_lookup_name_proto;
	default:
		return tracing_prog_func_proto(func_id, prog);
	}
}

const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
	.get_func_proto  = syscall_prog_func_proto,
	.is_valid_access = syscall_prog_is_valid_access,
};

const struct bpf_prog_ops bpf_syscall_prog_ops = {
	.test_run = bpf_prog_test_run_syscall,
};

#ifdef CONFIG_SYSCTL
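/* Handler for the "bpf_stats_enabled" sysctl: flips the
 * bpf_stats_enabled_key static key.  A private copy (val/saved_val) is
 * used under bpf_stats_enabled_mutex so that concurrent writers cannot
 * unbalance the static-key reference count.
 */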
static int bpf_stats_handler(struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct static_key *key = (struct static_key *)table->data;
	static int saved_val;
	int val, ret;
	struct ctl_table tmp = {
		.data   = &val,
		.maxlen = sizeof(val),
		.mode   = table->mode,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&bpf_stats_enabled_mutex);
	val = saved_val;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret && val != saved_val) {
		if (val)
			static_key_slow_inc(key);
		else
			static_key_slow_dec(key);
		saved_val = val;
	}
	mutex_unlock(&bpf_stats_enabled_mutex);
	return ret;
}

void __weak unpriv_ebpf_notify(int new_state)
{
}

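/* Handler for "unprivileged_bpf_disabled".  Valid values are 0 (allow
 * unprivileged bpf()), 1 (disabled; cannot be undone at runtime) and
 * 2 (disabled, but a CAP_SYS_ADMIN writer may change it again).  Once the
 * value is 1, any attempt to write a different value returns -EPERM.
 */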
static int bpf_unpriv_handler(struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, unpriv_enable = *(int *)table->data;
	bool locked_state = unpriv_enable == 1;
	struct ctl_table tmp = *table;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	tmp.data = &unpriv_enable;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret) {
		if (locked_state && unpriv_enable != 1)
			return -EPERM;
		*(int *)table->data = unpriv_enable;
	}

	if (write)
		unpriv_ebpf_notify(unpriv_enable);

	return ret;
}

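/* Knobs exposed under /proc/sys/kernel/: unprivileged_bpf_disabled
 * (range 0-2, see bpf_unpriv_handler() above) and bpf_stats_enabled
 * (boolean, see bpf_stats_handler() above).
 */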
static struct ctl_table bpf_syscall_table[] = {
	{
		.procname	= "unprivileged_bpf_disabled",
		.data		= &sysctl_unprivileged_bpf_disabled,
		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
		.mode		= 0644,
		.proc_handler	= bpf_unpriv_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "bpf_stats_enabled",
		.data		= &bpf_stats_enabled_key.key,
		.mode		= 0644,
		.proc_handler	= bpf_stats_handler,
	},
};

static int __init bpf_syscall_sysctl_init(void)
{
	register_sysctl_init("kernel", bpf_syscall_table);
	return 0;
}
late_initcall(bpf_syscall_sysctl_init);
#endif /* CONFIG_SYSCTL */