1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 * Copyright (c) 2016 Facebook
4 * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
5 */
6#include <uapi/linux/btf.h>
7#include <linux/bpf-cgroup.h>
8#include <linux/kernel.h>
9#include <linux/types.h>
10#include <linux/slab.h>
11#include <linux/bpf.h>
12#include <linux/btf.h>
13#include <linux/bpf_verifier.h>
14#include <linux/filter.h>
15#include <net/netlink.h>
16#include <linux/file.h>
17#include <linux/vmalloc.h>
18#include <linux/stringify.h>
19#include <linux/bsearch.h>
20#include <linux/sort.h>
21#include <linux/perf_event.h>
22#include <linux/ctype.h>
23#include <linux/error-injection.h>
24#include <linux/bpf_lsm.h>
25#include <linux/btf_ids.h>
26#include <linux/poison.h>
27#include <linux/module.h>
28#include <linux/cpumask.h>
29#include <linux/bpf_mem_alloc.h>
30#include <net/xdp.h>
31
32#include "disasm.h"
33
34static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
35#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
36	[_id] = & _name ## _verifier_ops,
37#define BPF_MAP_TYPE(_id, _ops)
38#define BPF_LINK_TYPE(_id, _name)
39#include <linux/bpf_types.h>
40#undef BPF_PROG_TYPE
41#undef BPF_MAP_TYPE
42#undef BPF_LINK_TYPE
43};
44
45struct bpf_mem_alloc bpf_global_percpu_ma;
46static bool bpf_global_percpu_ma_set;
47
48/* bpf_check() is a static code analyzer that walks eBPF program
49 * instruction by instruction and updates register/stack state.
50 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
51 *
52 * The first pass is depth-first-search to check that the program is a DAG.
53 * It rejects the following programs:
54 * - larger than BPF_MAXINSNS insns
55 * - containing a loop (detected via a back-edge)
56 * - containing unreachable insns (the program must be a single function, not a forest)
57 * - out of bounds or malformed jumps
58 * The second pass is all possible path descent from the 1st insn.
59 * Since it's analyzing all paths through the program, the length of the
60 * analysis is limited to 64k insn, which may be hit even when the total number
61 * of insns is less than 4K if there are too many branches that change stack/regs.
62 * The number of 'branches to be analyzed' is limited to 1k.
63 *
64 * On entry to each instruction, each register has a type, and the instruction
65 * changes the types of the registers depending on instruction semantics.
66 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
67 * copied to R1.
68 *
69 * All registers are 64-bit.
70 * R0 - return register
71 * R1-R5 argument passing registers
72 * R6-R9 callee saved registers
73 * R10 - frame pointer read-only
74 *
75 * At the start of BPF program the register R1 contains a pointer to bpf_context
76 * and has type PTR_TO_CTX.
77 *
78 * Verifier tracks arithmetic operations on pointers in case:
79 *    BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
80 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
81 * 1st insn copies R10 (which has FRAME_PTR type) into R1
82 * and 2nd arithmetic instruction is pattern matched to recognize
83 * that it wants to construct a pointer to some element within stack.
84 * So after 2nd insn, the register R1 has type PTR_TO_STACK
85 * (and -20 constant is saved for further stack bounds checking).
86 * Meaning that this reg is a pointer to stack plus known immediate constant.
87 *
88 * Most of the time the registers have SCALAR_VALUE type, which
89 * means the register has some value, but it's not a valid pointer.
90 * (like pointer plus pointer becomes SCALAR_VALUE type)
91 *
92 * When verifier sees load or store instructions the type of base register
93 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
94 * four pointer types recognized by the check_mem_access() function.
95 *
96 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
97 * and the range of [ptr, ptr + map's value_size) is accessible.
98 *
99 * Registers used to pass values to function calls are checked against
100 * function argument constraints.
101 *
102 * ARG_PTR_TO_MAP_KEY is one such argument constraint.
103 * It means that the register type passed to this function must be
104 * PTR_TO_STACK and it will be used inside the function as
105 * 'pointer to map element key'
106 *
107 * For example the argument constraints for bpf_map_lookup_elem():
108 *   .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
109 *   .arg1_type = ARG_CONST_MAP_PTR,
110 *   .arg2_type = ARG_PTR_TO_MAP_KEY,
111 *
112 * ret_type says that this function returns 'pointer to map elem value or null'.
113 * The function expects the 1st argument to be a const pointer to 'struct bpf_map' and
114 * the 2nd argument to be a pointer to stack, which will be used inside
115 * the helper function as a pointer to map element key.
116 *
117 * On the kernel side the helper function looks like:
118 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
119 * {
120 *    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
121 *    void *key = (void *) (unsigned long) r2;
122 *    void *value;
123 *
124 *    here kernel can access 'key' and 'map' pointers safely, knowing that
125 *    [key, key + map->key_size) bytes are valid and were initialized on
126 *    the stack of eBPF program.
127 * }
128 *
129 * Corresponding eBPF program may look like:
130 *    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),  // after this insn R2 type is FRAME_PTR
131 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
132 *    BPF_LD_MAP_FD(BPF_REG_1, map_fd),      // after this insn R1 type is CONST_PTR_TO_MAP
133 *    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
134 * here verifier looks at prototype of map_lookup_elem() and sees:
135 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
136 * so the verifier now knows that this map has a key of R1->map_ptr->key_size bytes.
137 *
138 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
139 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
140 * and were initialized prior to this call.
141 * If it's ok, then verifier allows this BPF_CALL insn and looks at
142 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
143 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
144 * returns either pointer to map value or NULL.
145 *
146 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
147 * insn, the register holding that pointer in the true branch changes state to
148 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
149 * branch. See check_cond_jmp_op().
150 *
151 * After the call R0 is set to return type of the function and registers R1-R5
152 * are set to NOT_INIT to indicate that they are no longer readable.
153 *
154 * The following reference types represent a potential reference to a kernel
155 * resource which, after first being allocated, must be checked and freed by
156 * the BPF program:
157 * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
158 *
159 * When the verifier sees a helper call return a reference type, it allocates a
160 * pointer id for the reference and stores it in the current function state.
161 * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
162 * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
163 * passes through a NULL-check conditional. For the branch wherein the state is
164 * changed to CONST_IMM, the verifier releases the reference.
165 *
166 * For each helper function that allocates a reference, such as
167 * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
168 * bpf_sk_release(). When a reference type passes into the release function,
169 * the verifier also releases the reference. If any unchecked or unreleased
170 * reference remains at the end of the program, the verifier rejects it.
171 */
172
173/* verifier_state + insn_idx are pushed to stack when branch is encountered */
174struct bpf_verifier_stack_elem {
175	/* verifier state is 'st'
176	 * before processing instruction 'insn_idx'
177	 * and after processing instruction 'prev_insn_idx'
178	 */
179	struct bpf_verifier_state st;
180	int insn_idx;
181	int prev_insn_idx;
182	struct bpf_verifier_stack_elem *next;
183	/* length of verifier log at the time this state was pushed on stack */
184	u32 log_pos;
185};
186
187#define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
188#define BPF_COMPLEXITY_LIMIT_STATES	64
189
190#define BPF_MAP_KEY_POISON	(1ULL << 63)
191#define BPF_MAP_KEY_SEEN	(1ULL << 62)
192
193#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
194
195static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
196static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
197static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
198static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
199static int ref_set_non_owning(struct bpf_verifier_env *env,
200			      struct bpf_reg_state *reg);
201static void specialize_kfunc(struct bpf_verifier_env *env,
202			     u32 func_id, u16 offset, unsigned long *addr);
203static bool is_trusted_reg(const struct bpf_reg_state *reg);
204
205static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
206{
207	return aux->map_ptr_state.poison;
208}
209
210static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
211{
212	return aux->map_ptr_state.unpriv;
213}
214
215static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
216			      struct bpf_map *map,
217			      bool unpriv, bool poison)
218{
219	unpriv |= bpf_map_ptr_unpriv(aux);
220	aux->map_ptr_state.unpriv = unpriv;
221	aux->map_ptr_state.poison = poison;
222	aux->map_ptr_state.map_ptr = map;
223}
224
225static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
226{
227	return aux->map_key_state & BPF_MAP_KEY_POISON;
228}
229
230static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
231{
232	return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
233}
234
235static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
236{
237	return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
238}
239
240static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
241{
242	bool poisoned = bpf_map_key_poisoned(aux);
243
244	aux->map_key_state = state | BPF_MAP_KEY_SEEN |
245			     (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
246}
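/* Encoding example for the helpers above: when the key is a known constant,
 * say 5, bpf_map_key_store(aux, 5) records (5 | BPF_MAP_KEY_SEEN);
 * bpf_map_key_immediate() later recovers the value 5, and a previously set
 * BPF_MAP_KEY_POISON bit is preserved across subsequent stores.
 */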
247
248static bool bpf_helper_call(const struct bpf_insn *insn)
249{
250	return insn->code == (BPF_JMP | BPF_CALL) &&
251	       insn->src_reg == 0;
252}
253
254static bool bpf_pseudo_call(const struct bpf_insn *insn)
255{
256	return insn->code == (BPF_JMP | BPF_CALL) &&
257	       insn->src_reg == BPF_PSEUDO_CALL;
258}
259
260static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
261{
262	return insn->code == (BPF_JMP | BPF_CALL) &&
263	       insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
264}
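/* To illustrate how the three call forms above differ: for a BPF_JMP | BPF_CALL
 * insn, src_reg == 0 denotes a call to a helper identified by insn->imm
 * (e.g. BPF_FUNC_map_lookup_elem), src_reg == BPF_PSEUDO_CALL denotes a call
 * to another BPF subprog with insn->imm holding the instruction offset, and
 * src_reg == BPF_PSEUDO_KFUNC_CALL denotes a call to a kernel function with
 * insn->imm holding its BTF id.
 */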
265
266struct bpf_call_arg_meta {
267	struct bpf_map *map_ptr;
268	bool raw_mode;
269	bool pkt_access;
270	u8 release_regno;
271	int regno;
272	int access_size;
273	int mem_size;
274	u64 msize_max_value;
275	int ref_obj_id;
276	int dynptr_id;
277	int map_uid;
278	int func_id;
279	struct btf *btf;
280	u32 btf_id;
281	struct btf *ret_btf;
282	u32 ret_btf_id;
283	u32 subprogno;
284	struct btf_field *kptr_field;
285};
286
287struct bpf_kfunc_call_arg_meta {
288	/* In parameters */
289	struct btf *btf;
290	u32 func_id;
291	u32 kfunc_flags;
292	const struct btf_type *func_proto;
293	const char *func_name;
294	/* Out parameters */
295	u32 ref_obj_id;
296	u8 release_regno;
297	bool r0_rdonly;
298	u32 ret_btf_id;
299	u64 r0_size;
300	u32 subprogno;
301	struct {
302		u64 value;
303		bool found;
304	} arg_constant;
305
306	/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
307	 * generally to pass info about user-defined local kptr types to later
308	 * verification logic
309	 *   bpf_obj_drop/bpf_percpu_obj_drop
310	 *     Record the local kptr type to be drop'd
311	 *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
312	 *     Record the local kptr type to be refcount_incr'd and use
313	 *     arg_owning_ref to determine whether refcount_acquire should be
314	 *     fallible
315	 */
316	struct btf *arg_btf;
317	u32 arg_btf_id;
318	bool arg_owning_ref;
319
320	struct {
321		struct btf_field *field;
322	} arg_list_head;
323	struct {
324		struct btf_field *field;
325	} arg_rbtree_root;
326	struct {
327		enum bpf_dynptr_type type;
328		u32 id;
329		u32 ref_obj_id;
330	} initialized_dynptr;
331	struct {
332		u8 spi;
333		u8 frameno;
334	} iter;
335	struct {
336		struct bpf_map *ptr;
337		int uid;
338	} map;
339	u64 mem_size;
340};
341
342struct btf *btf_vmlinux;
343
344static const char *btf_type_name(const struct btf *btf, u32 id)
345{
346	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
347}
348
349static DEFINE_MUTEX(bpf_verifier_lock);
350static DEFINE_MUTEX(bpf_percpu_ma_lock);
351
352__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
353{
354	struct bpf_verifier_env *env = private_data;
355	va_list args;
356
357	if (!bpf_verifier_log_needed(&env->log))
358		return;
359
360	va_start(args, fmt);
361	bpf_verifier_vlog(&env->log, fmt, args);
362	va_end(args);
363}
364
365static void verbose_invalid_scalar(struct bpf_verifier_env *env,
366				   struct bpf_reg_state *reg,
367				   struct bpf_retval_range range, const char *ctx,
368				   const char *reg_name)
369{
370	bool unknown = true;
371
372	verbose(env, "%s the register %s has", ctx, reg_name);
373	if (reg->smin_value > S64_MIN) {
374		verbose(env, " smin=%lld", reg->smin_value);
375		unknown = false;
376	}
377	if (reg->smax_value < S64_MAX) {
378		verbose(env, " smax=%lld", reg->smax_value);
379		unknown = false;
380	}
381	if (unknown)
382		verbose(env, " unknown scalar value");
383	verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
384}
385
386static bool type_may_be_null(u32 type)
387{
388	return type & PTR_MAYBE_NULL;
389}
390
391static bool reg_not_null(const struct bpf_reg_state *reg)
392{
393	enum bpf_reg_type type;
394
395	type = reg->type;
396	if (type_may_be_null(type))
397		return false;
398
399	type = base_type(type);
400	return type == PTR_TO_SOCKET ||
401		type == PTR_TO_TCP_SOCK ||
402		type == PTR_TO_MAP_VALUE ||
403		type == PTR_TO_MAP_KEY ||
404		type == PTR_TO_SOCK_COMMON ||
405		(type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
406		type == PTR_TO_MEM;
407}
408
409static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
410{
411	struct btf_record *rec = NULL;
412	struct btf_struct_meta *meta;
413
414	if (reg->type == PTR_TO_MAP_VALUE) {
415		rec = reg->map_ptr->record;
416	} else if (type_is_ptr_alloc_obj(reg->type)) {
417		meta = btf_find_struct_meta(reg->btf, reg->btf_id);
418		if (meta)
419			rec = meta->record;
420	}
421	return rec;
422}
423
424static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
425{
426	struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
427
428	return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
429}
430
431static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
432{
433	struct bpf_func_info *info;
434
435	if (!env->prog->aux->func_info)
436		return "";
437
438	info = &env->prog->aux->func_info[subprog];
439	return btf_type_name(env->prog->aux->btf, info->type_id);
440}
441
442static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
443{
444	struct bpf_subprog_info *info = subprog_info(env, subprog);
445
446	info->is_cb = true;
447	info->is_async_cb = true;
448	info->is_exception_cb = true;
449}
450
451static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
452{
453	return subprog_info(env, subprog)->is_exception_cb;
454}
455
456static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
457{
458	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
459}
460
461static bool type_is_rdonly_mem(u32 type)
462{
463	return type & MEM_RDONLY;
464}
465
466static bool is_acquire_function(enum bpf_func_id func_id,
467				const struct bpf_map *map)
468{
469	enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
470
471	if (func_id == BPF_FUNC_sk_lookup_tcp ||
472	    func_id == BPF_FUNC_sk_lookup_udp ||
473	    func_id == BPF_FUNC_skc_lookup_tcp ||
474	    func_id == BPF_FUNC_ringbuf_reserve ||
475	    func_id == BPF_FUNC_kptr_xchg)
476		return true;
477
478	if (func_id == BPF_FUNC_map_lookup_elem &&
479	    (map_type == BPF_MAP_TYPE_SOCKMAP ||
480	     map_type == BPF_MAP_TYPE_SOCKHASH))
481		return true;
482
483	return false;
484}
485
486static bool is_ptr_cast_function(enum bpf_func_id func_id)
487{
488	return func_id == BPF_FUNC_tcp_sock ||
489		func_id == BPF_FUNC_sk_fullsock ||
490		func_id == BPF_FUNC_skc_to_tcp_sock ||
491		func_id == BPF_FUNC_skc_to_tcp6_sock ||
492		func_id == BPF_FUNC_skc_to_udp6_sock ||
493		func_id == BPF_FUNC_skc_to_mptcp_sock ||
494		func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
495		func_id == BPF_FUNC_skc_to_tcp_request_sock;
496}
497
498static bool is_dynptr_ref_function(enum bpf_func_id func_id)
499{
500	return func_id == BPF_FUNC_dynptr_data;
501}
502
503static bool is_sync_callback_calling_kfunc(u32 btf_id);
504static bool is_async_callback_calling_kfunc(u32 btf_id);
505static bool is_callback_calling_kfunc(u32 btf_id);
506static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
507
508static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
509
510static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
511{
512	return func_id == BPF_FUNC_for_each_map_elem ||
513	       func_id == BPF_FUNC_find_vma ||
514	       func_id == BPF_FUNC_loop ||
515	       func_id == BPF_FUNC_user_ringbuf_drain;
516}
517
518static bool is_async_callback_calling_function(enum bpf_func_id func_id)
519{
520	return func_id == BPF_FUNC_timer_set_callback;
521}
522
523static bool is_callback_calling_function(enum bpf_func_id func_id)
524{
525	return is_sync_callback_calling_function(func_id) ||
526	       is_async_callback_calling_function(func_id);
527}
528
529static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
530{
531	return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
532	       (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
533}
534
535static bool is_async_callback_calling_insn(struct bpf_insn *insn)
536{
537	return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
538	       (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
539}
540
541static bool is_may_goto_insn(struct bpf_insn *insn)
542{
543	return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
544}
545
546static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
547{
548	return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
549}
550
551static bool is_storage_get_function(enum bpf_func_id func_id)
552{
553	return func_id == BPF_FUNC_sk_storage_get ||
554	       func_id == BPF_FUNC_inode_storage_get ||
555	       func_id == BPF_FUNC_task_storage_get ||
556	       func_id == BPF_FUNC_cgrp_storage_get;
557}
558
559static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
560					const struct bpf_map *map)
561{
562	int ref_obj_uses = 0;
563
564	if (is_ptr_cast_function(func_id))
565		ref_obj_uses++;
566	if (is_acquire_function(func_id, map))
567		ref_obj_uses++;
568	if (is_dynptr_ref_function(func_id))
569		ref_obj_uses++;
570
571	return ref_obj_uses > 1;
572}
573
574static bool is_cmpxchg_insn(const struct bpf_insn *insn)
575{
576	return BPF_CLASS(insn->code) == BPF_STX &&
577	       BPF_MODE(insn->code) == BPF_ATOMIC &&
578	       insn->imm == BPF_CMPXCHG;
579}
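/* For example, an insn built as BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_1,
 * BPF_REG_2, 0) should match this check: its class is BPF_STX, its mode is
 * BPF_ATOMIC and its imm is BPF_CMPXCHG.
 */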
580
581static int __get_spi(s32 off)
582{
583	return (-off - 1) / BPF_REG_SIZE;
584}
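/* E.g. with BPF_REG_SIZE == 8, stack offsets -1..-8 map to spi 0,
 * -9..-16 map to spi 1, and so on; larger spi values correspond to slots
 * further away from the frame pointer.
 */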
585
586static struct bpf_func_state *func(struct bpf_verifier_env *env,
587				   const struct bpf_reg_state *reg)
588{
589	struct bpf_verifier_state *cur = env->cur_state;
590
591	return cur->frame[reg->frameno];
592}
593
594static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
595{
596	int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
597
598	/* We need to check that slots between [spi - nr_slots + 1, spi] are
599	 * within [0, allocated_stack).
600	 *
601	 * Please note that the spi grows downwards. For example, a dynptr
602	 * takes the size of two stack slots; the first slot will be at
603	 * spi and the second slot will be at spi - 1.
604	 */
605	return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
606}
607
608static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
609			          const char *obj_kind, int nr_slots)
610{
611	int off, spi;
612
613	if (!tnum_is_const(reg->var_off)) {
614		verbose(env, "%s has to be at a constant offset\n", obj_kind);
615		return -EINVAL;
616	}
617
618	off = reg->off + reg->var_off.value;
619	if (off % BPF_REG_SIZE) {
620		verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
621		return -EINVAL;
622	}
623
624	spi = __get_spi(off);
625	if (spi + 1 < nr_slots) {
626		verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
627		return -EINVAL;
628	}
629
630	if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
631		return -ERANGE;
632	return spi;
633}
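/* As an illustration, a dynptr passed at fp-16 has off == -16, which is
 * 8-byte aligned and yields spi == 1; with nr_slots == BPF_DYNPTR_NR_SLOTS (2)
 * the object occupies slots 1 and 0, and is_spi_bounds_valid() checks that
 * both fall within the currently allocated stack.
 */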
634
635static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
636{
637	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
638}
639
640static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
641{
642	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
643}
644
645static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
646{
647	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
648	case DYNPTR_TYPE_LOCAL:
649		return BPF_DYNPTR_TYPE_LOCAL;
650	case DYNPTR_TYPE_RINGBUF:
651		return BPF_DYNPTR_TYPE_RINGBUF;
652	case DYNPTR_TYPE_SKB:
653		return BPF_DYNPTR_TYPE_SKB;
654	case DYNPTR_TYPE_XDP:
655		return BPF_DYNPTR_TYPE_XDP;
656	default:
657		return BPF_DYNPTR_TYPE_INVALID;
658	}
659}
660
661static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
662{
663	switch (type) {
664	case BPF_DYNPTR_TYPE_LOCAL:
665		return DYNPTR_TYPE_LOCAL;
666	case BPF_DYNPTR_TYPE_RINGBUF:
667		return DYNPTR_TYPE_RINGBUF;
668	case BPF_DYNPTR_TYPE_SKB:
669		return DYNPTR_TYPE_SKB;
670	case BPF_DYNPTR_TYPE_XDP:
671		return DYNPTR_TYPE_XDP;
672	default:
673		return 0;
674	}
675}
676
677static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
678{
679	return type == BPF_DYNPTR_TYPE_RINGBUF;
680}
681
682static void __mark_dynptr_reg(struct bpf_reg_state *reg,
683			      enum bpf_dynptr_type type,
684			      bool first_slot, int dynptr_id);
685
686static void __mark_reg_not_init(const struct bpf_verifier_env *env,
687				struct bpf_reg_state *reg);
688
689static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
690				   struct bpf_reg_state *sreg1,
691				   struct bpf_reg_state *sreg2,
692				   enum bpf_dynptr_type type)
693{
694	int id = ++env->id_gen;
695
696	__mark_dynptr_reg(sreg1, type, true, id);
697	__mark_dynptr_reg(sreg2, type, false, id);
698}
699
700static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
701			       struct bpf_reg_state *reg,
702			       enum bpf_dynptr_type type)
703{
704	__mark_dynptr_reg(reg, type, true, ++env->id_gen);
705}
706
707static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
708				        struct bpf_func_state *state, int spi);
709
710static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
711				   enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
712{
713	struct bpf_func_state *state = func(env, reg);
714	enum bpf_dynptr_type type;
715	int spi, i, err;
716
717	spi = dynptr_get_spi(env, reg);
718	if (spi < 0)
719		return spi;
720
721	/* We cannot assume both spi and spi - 1 belong to the same dynptr,
722	 * hence we need to call destroy_if_dynptr_stack_slot twice for both,
723	 * to ensure that for the following example:
724	 *	[d1][d1][d2][d2]
725	 * spi    3   2   1   0
726	 * So marking spi = 2 should lead to destruction of both d1 and d2. In
727	 * case they do belong to same dynptr, second call won't see slot_type
728	 * as STACK_DYNPTR and will simply skip destruction.
729	 */
730	err = destroy_if_dynptr_stack_slot(env, state, spi);
731	if (err)
732		return err;
733	err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
734	if (err)
735		return err;
736
737	for (i = 0; i < BPF_REG_SIZE; i++) {
738		state->stack[spi].slot_type[i] = STACK_DYNPTR;
739		state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
740	}
741
742	type = arg_to_dynptr_type(arg_type);
743	if (type == BPF_DYNPTR_TYPE_INVALID)
744		return -EINVAL;
745
746	mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
747			       &state->stack[spi - 1].spilled_ptr, type);
748
749	if (dynptr_type_refcounted(type)) {
750		/* The id is used to track proper releasing */
751		int id;
752
753		if (clone_ref_obj_id)
754			id = clone_ref_obj_id;
755		else
756			id = acquire_reference_state(env, insn_idx);
757
758		if (id < 0)
759			return id;
760
761		state->stack[spi].spilled_ptr.ref_obj_id = id;
762		state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
763	}
764
765	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
766	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
767
768	return 0;
769}
770
771static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
772{
773	int i;
774
775	for (i = 0; i < BPF_REG_SIZE; i++) {
776		state->stack[spi].slot_type[i] = STACK_INVALID;
777		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
778	}
779
780	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
781	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
782
783	/* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
784	 *
785	 * While we don't allow reading STACK_INVALID, it is still possible to
786	 * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
787	 * helpers or insns can do partial read of that part without failing,
788	 * but check_stack_range_initialized, check_stack_read_var_off, and
789	 * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
790	 * the slot conservatively. Hence we need to prevent those liveness
791	 * marking walks.
792	 *
793	 * This was not a problem before because STACK_INVALID is only set by
794	 * default (where the default reg state has its reg->parent as NULL), or
795	 * in clean_live_states after REG_LIVE_DONE (at which point
796	 * mark_reg_read won't walk reg->parent chain), but not randomly during
797	 * verifier state exploration (like we did above). Hence, for our case
798	 * parentage chain will still be live (i.e. reg->parent may be
799	 * non-NULL), while earlier reg->parent was NULL, so we need
800	 * REG_LIVE_WRITTEN to screen off read marker propagation when it is
801	 * done later on reads or by mark_dynptr_read, which would otherwise
802	 * unnecessarily mark registers in the verifier state.
803	 */
804	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
805	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
806}
807
808static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
809{
810	struct bpf_func_state *state = func(env, reg);
811	int spi, ref_obj_id, i;
812
813	spi = dynptr_get_spi(env, reg);
814	if (spi < 0)
815		return spi;
816
817	if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
818		invalidate_dynptr(env, state, spi);
819		return 0;
820	}
821
822	ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
823
824	/* If the dynptr has a ref_obj_id, then we need to invalidate
825	 * two things:
826	 *
827	 * 1) Any dynptrs with a matching ref_obj_id (clones)
828	 * 2) Any slices derived from this dynptr.
829	 */
830
831	/* Invalidate any slices associated with this dynptr */
832	WARN_ON_ONCE(release_reference(env, ref_obj_id));
833
834	/* Invalidate any dynptr clones */
835	for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
836		if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
837			continue;
838
839		/* it should always be the case that if the ref obj id
840		 * matches then the stack slot also belongs to a
841		 * dynptr
842		 */
843		if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
844			verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
845			return -EFAULT;
846		}
847		if (state->stack[i].spilled_ptr.dynptr.first_slot)
848			invalidate_dynptr(env, state, i);
849	}
850
851	return 0;
852}
853
854static void __mark_reg_unknown(const struct bpf_verifier_env *env,
855			       struct bpf_reg_state *reg);
856
857static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
858{
859	if (!env->allow_ptr_leaks)
860		__mark_reg_not_init(env, reg);
861	else
862		__mark_reg_unknown(env, reg);
863}
864
865static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
866				        struct bpf_func_state *state, int spi)
867{
868	struct bpf_func_state *fstate;
869	struct bpf_reg_state *dreg;
870	int i, dynptr_id;
871
872	/* We always ensure that STACK_DYNPTR is never set partially,
873	 * hence just checking for slot_type[0] is enough. This is
874	 * different for STACK_SPILL, where it may be only set for
875	 * 1 byte, so code has to use is_spilled_reg.
876	 */
877	if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
878		return 0;
879
880	/* Reposition spi to first slot */
881	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
882		spi = spi + 1;
883
884	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
885		verbose(env, "cannot overwrite referenced dynptr\n");
886		return -EINVAL;
887	}
888
889	mark_stack_slot_scratched(env, spi);
890	mark_stack_slot_scratched(env, spi - 1);
891
892	/* Writing partially to one dynptr stack slot destroys both. */
893	for (i = 0; i < BPF_REG_SIZE; i++) {
894		state->stack[spi].slot_type[i] = STACK_INVALID;
895		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
896	}
897
898	dynptr_id = state->stack[spi].spilled_ptr.id;
899	/* Invalidate any slices associated with this dynptr */
900	bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
901		/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
902		if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
903			continue;
904		if (dreg->dynptr_id == dynptr_id)
905			mark_reg_invalid(env, dreg);
906	}));
907
908	/* Do not release reference state, we are destroying dynptr on stack,
909	 * not using some helper to release it. Just reset register.
910	 */
911	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
912	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
913
914	/* Same reason as unmark_stack_slots_dynptr above */
915	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
916	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
917
918	return 0;
919}
920
921static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
922{
923	int spi;
924
925	if (reg->type == CONST_PTR_TO_DYNPTR)
926		return false;
927
928	spi = dynptr_get_spi(env, reg);
929
930	/* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
931	 * error because this just means the stack state hasn't been updated yet.
932	 * We will do check_mem_access to check and update stack bounds later.
933	 */
934	if (spi < 0 && spi != -ERANGE)
935		return false;
936
937	/* We don't need to check if the stack slots are marked by previous
938	 * dynptr initializations because we allow overwriting existing unreferenced
939	 * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
940	 * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
941	 * touching are completely destructed before we reinitialize them for a new
942	 * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
943	 * instead of delaying it until the end where the user will get "Unreleased
944	 * reference" error.
945	 */
946	return true;
947}
948
949static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
950{
951	struct bpf_func_state *state = func(env, reg);
952	int i, spi;
953
954	/* This already represents first slot of initialized bpf_dynptr.
955	 *
956	 * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
957	 * check_func_arg_reg_off's logic, so we don't need to check its
958	 * offset and alignment.
959	 */
960	if (reg->type == CONST_PTR_TO_DYNPTR)
961		return true;
962
963	spi = dynptr_get_spi(env, reg);
964	if (spi < 0)
965		return false;
966	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
967		return false;
968
969	for (i = 0; i < BPF_REG_SIZE; i++) {
970		if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
971		    state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
972			return false;
973	}
974
975	return true;
976}
977
978static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
979				    enum bpf_arg_type arg_type)
980{
981	struct bpf_func_state *state = func(env, reg);
982	enum bpf_dynptr_type dynptr_type;
983	int spi;
984
985	/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
986	if (arg_type == ARG_PTR_TO_DYNPTR)
987		return true;
988
989	dynptr_type = arg_to_dynptr_type(arg_type);
990	if (reg->type == CONST_PTR_TO_DYNPTR) {
991		return reg->dynptr.type == dynptr_type;
992	} else {
993		spi = dynptr_get_spi(env, reg);
994		if (spi < 0)
995			return false;
996		return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
997	}
998}
999
1000static void __mark_reg_known_zero(struct bpf_reg_state *reg);
1001
1002static bool in_rcu_cs(struct bpf_verifier_env *env);
1003
1004static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
1005
1006static int mark_stack_slots_iter(struct bpf_verifier_env *env,
1007				 struct bpf_kfunc_call_arg_meta *meta,
1008				 struct bpf_reg_state *reg, int insn_idx,
1009				 struct btf *btf, u32 btf_id, int nr_slots)
1010{
1011	struct bpf_func_state *state = func(env, reg);
1012	int spi, i, j, id;
1013
1014	spi = iter_get_spi(env, reg, nr_slots);
1015	if (spi < 0)
1016		return spi;
1017
1018	id = acquire_reference_state(env, insn_idx);
1019	if (id < 0)
1020		return id;
1021
1022	for (i = 0; i < nr_slots; i++) {
1023		struct bpf_stack_state *slot = &state->stack[spi - i];
1024		struct bpf_reg_state *st = &slot->spilled_ptr;
1025
1026		__mark_reg_known_zero(st);
1027		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
1028		if (is_kfunc_rcu_protected(meta)) {
1029			if (in_rcu_cs(env))
1030				st->type |= MEM_RCU;
1031			else
1032				st->type |= PTR_UNTRUSTED;
1033		}
1034		st->live |= REG_LIVE_WRITTEN;
1035		st->ref_obj_id = i == 0 ? id : 0;
1036		st->iter.btf = btf;
1037		st->iter.btf_id = btf_id;
1038		st->iter.state = BPF_ITER_STATE_ACTIVE;
1039		st->iter.depth = 0;
1040
1041		for (j = 0; j < BPF_REG_SIZE; j++)
1042			slot->slot_type[j] = STACK_ITER;
1043
1044		mark_stack_slot_scratched(env, spi - i);
1045	}
1046
1047	return 0;
1048}
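/* Note: nr_slots above is the iterator state size in BPF_REG_SIZE slots; for
 * illustration, an iterator whose kernel-side state struct is 24 bytes would
 * span nr_slots == 3. Only the slot at i == 0 (the one the register points at)
 * carries the acquired ref_obj_id; the remaining slots are tagged STACK_ITER
 * but keep ref_obj_id == 0.
 */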
1049
1050static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
1051				   struct bpf_reg_state *reg, int nr_slots)
1052{
1053	struct bpf_func_state *state = func(env, reg);
1054	int spi, i, j;
1055
1056	spi = iter_get_spi(env, reg, nr_slots);
1057	if (spi < 0)
1058		return spi;
1059
1060	for (i = 0; i < nr_slots; i++) {
1061		struct bpf_stack_state *slot = &state->stack[spi - i];
1062		struct bpf_reg_state *st = &slot->spilled_ptr;
1063
1064		if (i == 0)
1065			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
1066
1067		__mark_reg_not_init(env, st);
1068
1069		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
1070		st->live |= REG_LIVE_WRITTEN;
1071
1072		for (j = 0; j < BPF_REG_SIZE; j++)
1073			slot->slot_type[j] = STACK_INVALID;
1074
1075		mark_stack_slot_scratched(env, spi - i);
1076	}
1077
1078	return 0;
1079}
1080
1081static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
1082				     struct bpf_reg_state *reg, int nr_slots)
1083{
1084	struct bpf_func_state *state = func(env, reg);
1085	int spi, i, j;
1086
1087	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
1088	 * will do check_mem_access to check and update stack bounds later, so
1089	 * return true for that case.
1090	 */
1091	spi = iter_get_spi(env, reg, nr_slots);
1092	if (spi == -ERANGE)
1093		return true;
1094	if (spi < 0)
1095		return false;
1096
1097	for (i = 0; i < nr_slots; i++) {
1098		struct bpf_stack_state *slot = &state->stack[spi - i];
1099
1100		for (j = 0; j < BPF_REG_SIZE; j++)
1101			if (slot->slot_type[j] == STACK_ITER)
1102				return false;
1103	}
1104
1105	return true;
1106}
1107
1108static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
1109				   struct btf *btf, u32 btf_id, int nr_slots)
1110{
1111	struct bpf_func_state *state = func(env, reg);
1112	int spi, i, j;
1113
1114	spi = iter_get_spi(env, reg, nr_slots);
1115	if (spi < 0)
1116		return -EINVAL;
1117
1118	for (i = 0; i < nr_slots; i++) {
1119		struct bpf_stack_state *slot = &state->stack[spi - i];
1120		struct bpf_reg_state *st = &slot->spilled_ptr;
1121
1122		if (st->type & PTR_UNTRUSTED)
1123			return -EPROTO;
1124		/* only main (first) slot has ref_obj_id set */
1125		if (i == 0 && !st->ref_obj_id)
1126			return -EINVAL;
1127		if (i != 0 && st->ref_obj_id)
1128			return -EINVAL;
1129		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
1130			return -EINVAL;
1131
1132		for (j = 0; j < BPF_REG_SIZE; j++)
1133			if (slot->slot_type[j] != STACK_ITER)
1134				return -EINVAL;
1135	}
1136
1137	return 0;
1138}
1139
1140/* Check if given stack slot is "special":
1141 *   - spilled register state (STACK_SPILL);
1142 *   - dynptr state (STACK_DYNPTR);
1143 *   - iter state (STACK_ITER).
1144 */
1145static bool is_stack_slot_special(const struct bpf_stack_state *stack)
1146{
1147	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
1148
1149	switch (type) {
1150	case STACK_SPILL:
1151	case STACK_DYNPTR:
1152	case STACK_ITER:
1153		return true;
1154	case STACK_INVALID:
1155	case STACK_MISC:
1156	case STACK_ZERO:
1157		return false;
1158	default:
1159		WARN_ONCE(1, "unknown stack slot type %d\n", type);
1160		return true;
1161	}
1162}
1163
1164/* The reg state of a pointer or a bounded scalar was saved when
1165 * it was spilled to the stack.
1166 */
1167static bool is_spilled_reg(const struct bpf_stack_state *stack)
1168{
1169	return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
1170}
1171
1172static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
1173{
1174	return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
1175	       stack->spilled_ptr.type == SCALAR_VALUE;
1176}
1177
1178static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
1179{
1180	return stack->slot_type[0] == STACK_SPILL &&
1181	       stack->spilled_ptr.type == SCALAR_VALUE;
1182}
1183
1184/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
1185 * case they are equivalent, or it's STACK_ZERO, in which case we preserve
1186 * more precise STACK_ZERO.
1187 * Note, in unprivileged mode leaving STACK_INVALID is wrong, so we take
1188 * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
1189 */
1190static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
1191{
1192	if (*stype == STACK_ZERO)
1193		return;
1194	if (env->allow_ptr_leaks && *stype == STACK_INVALID)
1195		return;
1196	*stype = STACK_MISC;
1197}
1198
1199static void scrub_spilled_slot(u8 *stype)
1200{
1201	if (*stype != STACK_INVALID)
1202		*stype = STACK_MISC;
1203}
1204
1205/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
1206 * small to hold src. This is different from krealloc since we don't want to preserve
1207 * the contents of dst.
1208 *
1209 * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could
1210 * not be allocated.
1211 */
1212static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
1213{
1214	size_t alloc_bytes;
1215	void *orig = dst;
1216	size_t bytes;
1217
1218	if (ZERO_OR_NULL_PTR(src))
1219		goto out;
1220
1221	if (unlikely(check_mul_overflow(n, size, &bytes)))
1222		return NULL;
1223
1224	alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
1225	dst = krealloc(orig, alloc_bytes, flags);
1226	if (!dst) {
1227		kfree(orig);
1228		return NULL;
1229	}
1230
1231	memcpy(dst, src, bytes);
1232out:
1233	return dst ? dst : ZERO_SIZE_PTR;
1234}
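/* copy_array() is used below by copy_reference_state() and copy_stack_state()
 * (and for jmp_history in copy_verifier_state()) to duplicate per-state arrays
 * when verifier states are copied on branch pushes.
 */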
1235
1236/* resize an array from old_n items to new_n items. the array is reallocated if it's too
1237 * small to hold new_n items. new items are zeroed out if the array grows.
1238 *
1239 * Contrary to krealloc_array, does not free arr if new_n is zero.
1240 */
1241static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
1242{
1243	size_t alloc_size;
1244	void *new_arr;
1245
1246	if (!new_n || old_n == new_n)
1247		goto out;
1248
1249	alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
1250	new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
1251	if (!new_arr) {
1252		kfree(arr);
1253		return NULL;
1254	}
1255	arr = new_arr;
1256
1257	if (new_n > old_n)
1258		memset(arr + old_n * size, 0, (new_n - old_n) * size);
1259
1260out:
1261	return arr ? arr : ZERO_SIZE_PTR;
1262}
1263
1264static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
1265{
1266	dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
1267			       sizeof(struct bpf_reference_state), GFP_KERNEL);
1268	if (!dst->refs)
1269		return -ENOMEM;
1270
1271	dst->acquired_refs = src->acquired_refs;
1272	return 0;
1273}
1274
1275static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
1276{
1277	size_t n = src->allocated_stack / BPF_REG_SIZE;
1278
1279	dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
1280				GFP_KERNEL);
1281	if (!dst->stack)
1282		return -ENOMEM;
1283
1284	dst->allocated_stack = src->allocated_stack;
1285	return 0;
1286}
1287
1288static int resize_reference_state(struct bpf_func_state *state, size_t n)
1289{
1290	state->refs = realloc_array(state->refs, state->acquired_refs, n,
1291				    sizeof(struct bpf_reference_state));
1292	if (!state->refs)
1293		return -ENOMEM;
1294
1295	state->acquired_refs = n;
1296	return 0;
1297}
1298
1299/* Possibly update state->allocated_stack to be at least size bytes. Also
1300 * possibly update the function's high-water mark in its bpf_subprog_info.
1301 */
1302static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
1303{
1304	size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
1305
1306	/* The stack size is always a multiple of BPF_REG_SIZE. */
1307	size = round_up(size, BPF_REG_SIZE);
1308	n = size / BPF_REG_SIZE;
1309
1310	if (old_n >= n)
1311		return 0;
1312
1313	state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state));
1314	if (!state->stack)
1315		return -ENOMEM;
1316
1317	state->allocated_stack = size;
1318
1319	/* update known max for given subprogram */
1320	if (env->subprog_info[state->subprogno].stack_depth < size)
1321		env->subprog_info[state->subprogno].stack_depth = size;
1322
1323	return 0;
1324}
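/* For example, a request with size == 1 rounds up to BPF_REG_SIZE (8), so
 * allocated_stack becomes at least 8 bytes and the subprog's recorded
 * stack_depth is raised if it was smaller than that.
 */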
1325
1326/* Acquire a pointer id from the env and update the state->refs to include
1327 * this new pointer reference.
1328 * On success, returns a valid pointer id to associate with the register.
1329 * On failure, returns a negative errno.
1330 */
1331static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
1332{
1333	struct bpf_func_state *state = cur_func(env);
1334	int new_ofs = state->acquired_refs;
1335	int id, err;
1336
1337	err = resize_reference_state(state, state->acquired_refs + 1);
1338	if (err)
1339		return err;
1340	id = ++env->id_gen;
1341	state->refs[new_ofs].id = id;
1342	state->refs[new_ofs].insn_idx = insn_idx;
1343	state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
1344
1345	return id;
1346}
1347
1348/* release function corresponding to acquire_reference_state(). Idempotent. */
1349static int release_reference_state(struct bpf_func_state *state, int ptr_id)
1350{
1351	int i, last_idx;
1352
1353	last_idx = state->acquired_refs - 1;
1354	for (i = 0; i < state->acquired_refs; i++) {
1355		if (state->refs[i].id == ptr_id) {
1356			/* Cannot release caller references in callbacks */
1357			if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
1358				return -EINVAL;
1359			if (last_idx && i != last_idx)
1360				memcpy(&state->refs[i], &state->refs[last_idx],
1361				       sizeof(*state->refs));
1362			memset(&state->refs[last_idx], 0, sizeof(*state->refs));
1363			state->acquired_refs--;
1364			return 0;
1365		}
1366	}
1367	return -EINVAL;
1368}
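/* The released entry is overwritten by the last element and acquired_refs is
 * decremented, keeping state->refs densely packed; lookups above are purely
 * by id, so ordering does not matter.
 */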
1369
1370static void free_func_state(struct bpf_func_state *state)
1371{
1372	if (!state)
1373		return;
1374	kfree(state->refs);
1375	kfree(state->stack);
1376	kfree(state);
1377}
1378
1379static void clear_jmp_history(struct bpf_verifier_state *state)
1380{
1381	kfree(state->jmp_history);
1382	state->jmp_history = NULL;
1383	state->jmp_history_cnt = 0;
1384}
1385
1386static void free_verifier_state(struct bpf_verifier_state *state,
1387				bool free_self)
1388{
1389	int i;
1390
1391	for (i = 0; i <= state->curframe; i++) {
1392		free_func_state(state->frame[i]);
1393		state->frame[i] = NULL;
1394	}
1395	clear_jmp_history(state);
1396	if (free_self)
1397		kfree(state);
1398}
1399
1400/* copy verifier state from src to dst growing dst stack space
1401 * when necessary to accommodate larger src stack
1402 */
1403static int copy_func_state(struct bpf_func_state *dst,
1404			   const struct bpf_func_state *src)
1405{
1406	int err;
1407
1408	memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
1409	err = copy_reference_state(dst, src);
1410	if (err)
1411		return err;
1412	return copy_stack_state(dst, src);
1413}
1414
1415static int copy_verifier_state(struct bpf_verifier_state *dst_state,
1416			       const struct bpf_verifier_state *src)
1417{
1418	struct bpf_func_state *dst;
1419	int i, err;
1420
1421	dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
1422					  src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
1423					  GFP_USER);
1424	if (!dst_state->jmp_history)
1425		return -ENOMEM;
1426	dst_state->jmp_history_cnt = src->jmp_history_cnt;
1427
1428	/* if dst has more stack frames than src, free them; this is also
1429	 * necessary in case of exceptional exits using bpf_throw.
1430	 */
1431	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
1432		free_func_state(dst_state->frame[i]);
1433		dst_state->frame[i] = NULL;
1434	}
1435	dst_state->speculative = src->speculative;
1436	dst_state->active_rcu_lock = src->active_rcu_lock;
1437	dst_state->active_preempt_lock = src->active_preempt_lock;
1438	dst_state->in_sleepable = src->in_sleepable;
1439	dst_state->curframe = src->curframe;
1440	dst_state->active_lock.ptr = src->active_lock.ptr;
1441	dst_state->active_lock.id = src->active_lock.id;
1442	dst_state->branches = src->branches;
1443	dst_state->parent = src->parent;
1444	dst_state->first_insn_idx = src->first_insn_idx;
1445	dst_state->last_insn_idx = src->last_insn_idx;
1446	dst_state->dfs_depth = src->dfs_depth;
1447	dst_state->callback_unroll_depth = src->callback_unroll_depth;
1448	dst_state->used_as_loop_entry = src->used_as_loop_entry;
1449	dst_state->may_goto_depth = src->may_goto_depth;
1450	for (i = 0; i <= src->curframe; i++) {
1451		dst = dst_state->frame[i];
1452		if (!dst) {
1453			dst = kzalloc(sizeof(*dst), GFP_KERNEL);
1454			if (!dst)
1455				return -ENOMEM;
1456			dst_state->frame[i] = dst;
1457		}
1458		err = copy_func_state(dst, src->frame[i]);
1459		if (err)
1460			return err;
1461	}
1462	return 0;
1463}
1464
1465static u32 state_htab_size(struct bpf_verifier_env *env)
1466{
1467	return env->prog->len;
1468}
1469
1470static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
1471{
1472	struct bpf_verifier_state *cur = env->cur_state;
1473	struct bpf_func_state *state = cur->frame[cur->curframe];
1474
1475	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
1476}
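/* Candidate states are bucketed by insn_idx XOR'ed with the current frame's
 * callsite, so the same instruction reached from different call sites tends
 * to land in different buckets of env->explored_states.
 */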
1477
1478static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
1479{
1480	int fr;
1481
1482	if (a->curframe != b->curframe)
1483		return false;
1484
1485	for (fr = a->curframe; fr >= 0; fr--)
1486		if (a->frame[fr]->callsite != b->frame[fr]->callsite)
1487			return false;
1488
1489	return true;
1490}
1491
1492/* Open coded iterators allow back-edges in the state graph in order to
1493 * check loops that are driven by iterators and are otherwise unbounded.
1494 *
1495 * In is_state_visited() it is necessary to know if explored states are
1496 * part of some loops in order to decide whether non-exact states
1497 * comparison could be used:
1498 * - non-exact states comparison establishes sub-state relation and uses
1499 *   read and precision marks to do so, these marks are propagated from
1500 *   children states and thus are not guaranteed to be final in a loop;
1501 * - exact states comparison just checks if current and explored states
1502 *   are identical (and thus form a back-edge).
1503 *
1504 * Paper "A New Algorithm for Identifying Loops in Decompilation"
1505 * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
1506 * algorithm for loop structure detection and gives an overview of
1507 * relevant terminology. It also has helpful illustrations.
1508 *
1509 * [1] https://api.semanticscholar.org/CorpusID:15784067
1510 *
1511 * We use a similar algorithm, but because loop nesting structure is
1512 * irrelevant for the verifier, ours is significantly simpler and resembles
1513 * strongly connected components algorithm from Sedgewick's textbook.
1514 *
1515 * Define the topmost loop entry as the first node of the loop traversed in a
1516 * depth first search starting from initial state. The goal of the loop
1517 * tracking algorithm is to associate topmost loop entries with states
1518 * derived from these entries.
1519 *
1520 * At each step of the DFS states traversal, the algorithm needs to identify
1521 * the following situations:
1522 *
1523 *          initial                     initial                   initial
1524 *            |                           |                         |
1525 *            V                           V                         V
1526 *           ...                         ...           .---------> hdr
1527 *            |                           |            |            |
1528 *            V                           V            |            V
1529 *           cur                     .-> succ          |    .------...
1530 *            |                      |    |            |    |       |
1531 *            V                      |    V            |    V       V
1532 *           succ                    '-- cur           |   ...     ...
1533 *                                                     |    |       |
1534 *                                                     |    V       V
1535 *                                                     |   succ <- cur
1536 *                                                     |    |
1537 *                                                     |    V
1538 *                                                     |   ...
1539 *                                                     |    |
1540 *                                                     '----'
1541 *
1542 *  (A) successor state of cur   (B) successor state of cur or its entry
1543 *      not yet traversed            are in current DFS path, thus cur and succ
1544 *                                   are members of the same outermost loop
1545 *
1546 *                      initial                  initial
1547 *                        |                        |
1548 *                        V                        V
1549 *                       ...                      ...
1550 *                        |                        |
1551 *                        V                        V
1552 *                .------...               .------...
1553 *                |       |                |       |
1554 *                V       V                V       V
1555 *           .-> hdr     ...              ...     ...
1556 *           |    |       |                |       |
1557 *           |    V       V                V       V
1558 *           |   succ <- cur              succ <- cur
1559 *           |    |                        |
1560 *           |    V                        V
1561 *           |   ...                      ...
1562 *           |    |                        |
1563 *           '----'                       exit
1564 *
1565 * (C) successor state of cur is a part of some loop but this loop
1566 *     does not include cur or successor state is not in a loop at all.
1567 *
1568 * Algorithm could be described as the following python code:
1569 *
1570 *     traversed = set()   # Set of traversed nodes
1571 *     entries = {}        # Mapping from node to loop entry
1572 *     depths = {}         # Depth level assigned to graph node
1573 *     path = set()        # Current DFS path
1574 *
1575 *     # Find outermost loop entry known for n
1576 *     def get_loop_entry(n):
1577 *         h = entries.get(n, None)
1578 *         while h in entries and entries[h] != h:
1579 *             h = entries[h]
1580 *         return h
1581 *
1582 *     # Update n's loop entry if h's outermost entry comes
1583 *     # before n's outermost entry in current DFS path.
1584 *     def update_loop_entry(n, h):
1585 *         n1 = get_loop_entry(n) or n
1586 *         h1 = get_loop_entry(h) or h
1587 *         if h1 in path and depths[h1] <= depths[n1]:
1588 *             entries[n] = h1
1589 *
1590 *     def dfs(n, depth):
1591 *         traversed.add(n)
1592 *         path.add(n)
1593 *         depths[n] = depth
1594 *         for succ in G.successors(n):
1595 *             if succ not in traversed:
1596 *                 # Case A: explore succ and update cur's loop entry
1597 *                 #         only if succ's entry is in current DFS path.
1598 *                 dfs(succ, depth + 1)
1599 *                 h = get_loop_entry(succ)
1600 *                 update_loop_entry(n, h)
1601 *             else:
1602 *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
1603 *                 update_loop_entry(n, succ)
1604 *         path.remove(n)
1605 *
1606 * To adapt this algorithm for use with verifier:
1607 * - use st->branches == 0 as a signal that DFS of succ had been finished
1608 *   and cur's loop entry has to be updated (case A), handle this in
1609 *   update_branch_counts();
1610 * - use st->branches > 0 as a signal that st is in the current DFS path;
1611 * - handle cases B and C in is_state_visited();
1612 * - update topmost loop entry for intermediate states in get_loop_entry().
1613 */
1614static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
1615{
1616	struct bpf_verifier_state *topmost = st->loop_entry, *old;
1617
1618	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
1619		topmost = topmost->loop_entry;
1620	/* Update loop entries for intermediate states to avoid this
1621	 * traversal in future get_loop_entry() calls.
1622	 */
1623	while (st && st->loop_entry != topmost) {
1624		old = st->loop_entry;
1625		st->loop_entry = topmost;
1626		st = old;
1627	}
1628	return topmost;
1629}
1630
1631static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
1632{
1633	struct bpf_verifier_state *cur1, *hdr1;
1634
1635	cur1 = get_loop_entry(cur) ?: cur;
1636	hdr1 = get_loop_entry(hdr) ?: hdr;
1637	/* The hdr1->branches check decides between cases B and C in the
1638	 * comment for get_loop_entry(). If hdr1->branches == 0 then
1639	 * hdr's topmost loop entry is not in the current DFS path,
1640	 * hence 'cur' and 'hdr' are not in the same loop and there is
1641	 * no need to update cur->loop_entry.
1642	 */
1643	if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
1644		cur->loop_entry = hdr;
1645		hdr->used_as_loop_entry = true;
1646	}
1647}
1648
1649static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
1650{
1651	while (st) {
1652		u32 br = --st->branches;
1653
1654		/* br == 0 signals that DFS exploration for 'st' is finished,
1655		 * thus it is necessary to update parent's loop entry if it
1656		 * turned out that st is a part of some loop.
1657		 * This is a part of 'case A' in get_loop_entry() comment.
1658		 */
1659		if (br == 0 && st->parent && st->loop_entry)
1660			update_loop_entry(st->parent, st->loop_entry);
1661
1662		/* WARN_ON(br > 1) technically makes sense here,
1663		 * but see comment in push_stack(), hence:
1664		 */
1665		WARN_ONCE((int)br < 0,
1666			  "BUG update_branch_counts:branches_to_explore=%d\n",
1667			  br);
1668		if (br)
1669			break;
1670		st = st->parent;
1671	}
1672}
1673
1674static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
1675		     int *insn_idx, bool pop_log)
1676{
1677	struct bpf_verifier_state *cur = env->cur_state;
1678	struct bpf_verifier_stack_elem *elem, *head = env->head;
1679	int err;
1680
1681	if (env->head == NULL)
1682		return -ENOENT;
1683
1684	if (cur) {
1685		err = copy_verifier_state(cur, &head->st);
1686		if (err)
1687			return err;
1688	}
1689	if (pop_log)
1690		bpf_vlog_reset(&env->log, head->log_pos);
1691	if (insn_idx)
1692		*insn_idx = head->insn_idx;
1693	if (prev_insn_idx)
1694		*prev_insn_idx = head->prev_insn_idx;
1695	elem = head->next;
1696	free_verifier_state(&head->st, false);
1697	kfree(head);
1698	env->head = elem;
1699	env->stack_size--;
1700	return 0;
1701}
1702
1703static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
1704					     int insn_idx, int prev_insn_idx,
1705					     bool speculative)
1706{
1707	struct bpf_verifier_state *cur = env->cur_state;
1708	struct bpf_verifier_stack_elem *elem;
1709	int err;
1710
1711	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
1712	if (!elem)
1713		goto err;
1714
1715	elem->insn_idx = insn_idx;
1716	elem->prev_insn_idx = prev_insn_idx;
1717	elem->next = env->head;
1718	elem->log_pos = env->log.end_pos;
1719	env->head = elem;
1720	env->stack_size++;
1721	err = copy_verifier_state(&elem->st, cur);
1722	if (err)
1723		goto err;
1724	elem->st.speculative |= speculative;
1725	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
1726		verbose(env, "The sequence of %d jumps is too complex.\n",
1727			env->stack_size);
1728		goto err;
1729	}
1730	if (elem->st.parent) {
1731		++elem->st.parent->branches;
1732		/* WARN_ON(branches > 2) technically makes sense here,
1733		 * but
1734		 * 1. speculative states will bump 'branches' for non-branch
1735		 * instructions
1736		 * 2. is_state_visited() heuristics may decide not to create
1737		 * a new state for a sequence of branches and all such current
1738		 * and cloned states will be pointing to a single parent state
1739		 * which might have large 'branches' count.
1740		 */
1741	}
1742	return &elem->st;
1743err:
1744	free_verifier_state(env->cur_state, true);
1745	env->cur_state = NULL;
1746	/* pop all elements and return */
1747	while (!pop_stack(env, NULL, NULL, false));
1748	return NULL;
1749}
1750
1751#define CALLER_SAVED_REGS 6
1752static const int caller_saved[CALLER_SAVED_REGS] = {
1753	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
1754};
1755
1756/* This helper doesn't clear reg->id */
1757static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
1758{
1759	reg->var_off = tnum_const(imm);
1760	reg->smin_value = (s64)imm;
1761	reg->smax_value = (s64)imm;
1762	reg->umin_value = imm;
1763	reg->umax_value = imm;
1764
1765	reg->s32_min_value = (s32)imm;
1766	reg->s32_max_value = (s32)imm;
1767	reg->u32_min_value = (u32)imm;
1768	reg->u32_max_value = (u32)imm;
1769}
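
/* Worked example for the helper above (values are illustrative only):
 *
 *	___mark_reg_known(reg, 0xfffffffffffffffc);	// imm == (u64)-4
 *
 * results in:
 *	reg->var_off            == {.value = 0xfffffffffffffffc, .mask = 0}
 *	reg->smin/smax_value    == -4
 *	reg->umin/umax_value    == 0xfffffffffffffffc
 *	reg->s32_min/max_value  == -4
 *	reg->u32_min/max_value  == 0xfffffffc
 */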
1770
1771/* Mark the unknown part of a register (variable offset or scalar value) as
1772 * known to have the value @imm.
1773 */
1774static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
1775{
1776	/* Clear off and union(map_ptr, range) */
1777	memset(((u8 *)reg) + sizeof(reg->type), 0,
1778	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
1779	reg->id = 0;
1780	reg->ref_obj_id = 0;
1781	___mark_reg_known(reg, imm);
1782}
1783
1784static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
1785{
1786	reg->var_off = tnum_const_subreg(reg->var_off, imm);
1787	reg->s32_min_value = (s32)imm;
1788	reg->s32_max_value = (s32)imm;
1789	reg->u32_min_value = (u32)imm;
1790	reg->u32_max_value = (u32)imm;
1791}
1792
1793/* Mark the 'variable offset' part of a register as zero.  This should be
1794 * used only on registers holding a pointer type.
1795 */
1796static void __mark_reg_known_zero(struct bpf_reg_state *reg)
1797{
1798	__mark_reg_known(reg, 0);
1799}
1800
1801static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
1802{
1803	__mark_reg_known(reg, 0);
1804	reg->type = SCALAR_VALUE;
1805	/* all scalars are assumed imprecise initially (unless unprivileged,
1806	 * in which case everything is forced to be precise)
1807	 */
1808	reg->precise = !env->bpf_capable;
1809}
1810
1811static void mark_reg_known_zero(struct bpf_verifier_env *env,
1812				struct bpf_reg_state *regs, u32 regno)
1813{
1814	if (WARN_ON(regno >= MAX_BPF_REG)) {
1815		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
1816		/* Something bad happened, let's kill all regs */
1817		for (regno = 0; regno < MAX_BPF_REG; regno++)
1818			__mark_reg_not_init(env, regs + regno);
1819		return;
1820	}
1821	__mark_reg_known_zero(regs + regno);
1822}
1823
1824static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
1825			      bool first_slot, int dynptr_id)
1826{
1827	/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
1828	 * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
1829	 * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
1830	 */
1831	__mark_reg_known_zero(reg);
1832	reg->type = CONST_PTR_TO_DYNPTR;
1833	/* Give each dynptr a unique id to uniquely associate slices to it. */
1834	reg->id = dynptr_id;
1835	reg->dynptr.type = type;
1836	reg->dynptr.first_slot = first_slot;
1837}
1838
1839static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
1840{
1841	if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
1842		const struct bpf_map *map = reg->map_ptr;
1843
1844		if (map->inner_map_meta) {
1845			reg->type = CONST_PTR_TO_MAP;
1846			reg->map_ptr = map->inner_map_meta;
1847			/* transfer reg's id which is unique for every map_lookup_elem
1848			 * as UID of the inner map.
1849			 */
1850			if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
1851				reg->map_uid = reg->id;
1852			if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
1853				reg->map_uid = reg->id;
1854		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
1855			reg->type = PTR_TO_XDP_SOCK;
1856		} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
1857			   map->map_type == BPF_MAP_TYPE_SOCKHASH) {
1858			reg->type = PTR_TO_SOCKET;
1859		} else {
1860			reg->type = PTR_TO_MAP_VALUE;
1861		}
1862		return;
1863	}
1864
1865	reg->type &= ~PTR_MAYBE_NULL;
1866}
1867
1868static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
1869				struct btf_field_graph_root *ds_head)
1870{
1871	__mark_reg_known_zero(&regs[regno]);
1872	regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
1873	regs[regno].btf = ds_head->btf;
1874	regs[regno].btf_id = ds_head->value_btf_id;
1875	regs[regno].off = ds_head->node_offset;
1876}
1877
1878static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
1879{
1880	return type_is_pkt_pointer(reg->type);
1881}
1882
1883static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
1884{
1885	return reg_is_pkt_pointer(reg) ||
1886	       reg->type == PTR_TO_PACKET_END;
1887}
1888
1889static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
1890{
1891	return base_type(reg->type) == PTR_TO_MEM &&
1892		(reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
1893}
1894
1895/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
1896static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
1897				    enum bpf_reg_type which)
1898{
1899	/* The register can already have a range from prior markings.
1900	 * This is fine as long as it hasn't been advanced from its
1901	 * origin.
1902	 */
1903	return reg->type == which &&
1904	       reg->id == 0 &&
1905	       reg->off == 0 &&
1906	       tnum_equals_const(reg->var_off, 0);
1907}
1908
1909/* Reset the min/max bounds of a register */
1910static void __mark_reg_unbounded(struct bpf_reg_state *reg)
1911{
1912	reg->smin_value = S64_MIN;
1913	reg->smax_value = S64_MAX;
1914	reg->umin_value = 0;
1915	reg->umax_value = U64_MAX;
1916
1917	reg->s32_min_value = S32_MIN;
1918	reg->s32_max_value = S32_MAX;
1919	reg->u32_min_value = 0;
1920	reg->u32_max_value = U32_MAX;
1921}
1922
1923static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
1924{
1925	reg->smin_value = S64_MIN;
1926	reg->smax_value = S64_MAX;
1927	reg->umin_value = 0;
1928	reg->umax_value = U64_MAX;
1929}
1930
1931static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
1932{
1933	reg->s32_min_value = S32_MIN;
1934	reg->s32_max_value = S32_MAX;
1935	reg->u32_min_value = 0;
1936	reg->u32_max_value = U32_MAX;
1937}
1938
1939static void __update_reg32_bounds(struct bpf_reg_state *reg)
1940{
1941	struct tnum var32_off = tnum_subreg(reg->var_off);
1942
1943	/* min signed is max(sign bit) | min(other bits) */
1944	reg->s32_min_value = max_t(s32, reg->s32_min_value,
1945			var32_off.value | (var32_off.mask & S32_MIN));
1946	/* max signed is min(sign bit) | max(other bits) */
1947	reg->s32_max_value = min_t(s32, reg->s32_max_value,
1948			var32_off.value | (var32_off.mask & S32_MAX));
1949	reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
1950	reg->u32_max_value = min(reg->u32_max_value,
1951				 (u32)(var32_off.value | var32_off.mask));
1952}
1953
1954static void __update_reg64_bounds(struct bpf_reg_state *reg)
1955{
1956	/* min signed is max(sign bit) | min(other bits) */
1957	reg->smin_value = max_t(s64, reg->smin_value,
1958				reg->var_off.value | (reg->var_off.mask & S64_MIN));
1959	/* max signed is min(sign bit) | max(other bits) */
1960	reg->smax_value = min_t(s64, reg->smax_value,
1961				reg->var_off.value | (reg->var_off.mask & S64_MAX));
1962	reg->umin_value = max(reg->umin_value, reg->var_off.value);
1963	reg->umax_value = min(reg->umax_value,
1964			      reg->var_off.value | reg->var_off.mask);
1965}
1966
1967static void __update_reg_bounds(struct bpf_reg_state *reg)
1968{
1969	__update_reg32_bounds(reg);
1970	__update_reg64_bounds(reg);
1971}
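
/* Worked example for the bounds update above (register state is
 * hypothetical): suppose var_off == {.value = 0x2, .mask = 0x4}, i.e. only
 * bit 2 is unknown so the value is either 2 or 6, and the current bounds
 * are still [S64_MIN, S64_MAX] and [0, U64_MAX]. __update_reg64_bounds()
 * then derives:
 *	umin = max(umin, value)                    = 2
 *	umax = min(umax, value | mask)             = 6
 *	smin = max(smin, value | (mask & S64_MIN)) = 2	(sign bit known 0)
 *	smax = min(smax, value | (mask & S64_MAX)) = 6
 * and __update_reg32_bounds() does the same for the low 32-bit subregister.
 */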
1972
1973/* Uses signed min/max values to inform unsigned, and vice-versa */
1974static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
1975{
1976	/* If upper 32 bits of u64/s64 range don't change, we can use lower 32
1977	 * bits to improve our u32/s32 boundaries.
1978	 *
1979	 * E.g., the case where we have upper 32 bits as zero ([10, 20] in
1980	 * u64) is pretty trivial, it's obvious that in u32 we'll also have
1981	 * [10, 20] range. But this property holds for any 64-bit range as
1982	 * long as upper 32 bits in that entire range of values stay the same.
1983	 *
1984	 * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
1985	 * in decimal) has the same upper 32 bits throughout all the values in
1986	 * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
1987	 * range.
1988	 *
1989	 * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
1990	 * following the rules outlined below about u64/s64 correspondence
1991	 * (which equally applies to u32 vs s32 correspondence). In general it
1992	 * depends on actual hexadecimal values of 32-bit range. They can form
1993	 * only valid u32, or only valid s32 ranges in some cases.
1994	 *
1995	 * So we use all these insights to derive bounds for subregisters here.
1996	 */
1997	if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
1998		/* u64 to u32 casting preserves validity of low 32 bits as
1999		 * a range, if upper 32 bits are the same
2000		 */
2001		reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
2002		reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
2003
2004		if ((s32)reg->umin_value <= (s32)reg->umax_value) {
2005			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
2006			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
2007		}
2008	}
2009	if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
2010		/* low 32 bits should form a proper u32 range */
2011		if ((u32)reg->smin_value <= (u32)reg->smax_value) {
2012			reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
2013			reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
2014		}
2015		/* low 32 bits should form a proper s32 range */
2016		if ((s32)reg->smin_value <= (s32)reg->smax_value) {
2017			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
2018			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
2019		}
2020	}
2021	/* Special case where upper bits form a small sequence of two
2022	 * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
2023	 * 0x00000000 is also valid), while lower bits form a proper s32 range
2024	 * going from negative numbers to positive numbers. E.g., let's say we
2025	 * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
2026	 * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
2027	 * 0x0000000000000000, 0x0000000000000001}). Ignoring upper 32 bits,
2028	 * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
2029	 * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
2030	 * upper 32 bits. As a random example, s64 range
2031	 * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
2032	 * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
2033	 */
2034	if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
2035	    (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
2036		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
2037		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
2038	}
2039	if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
2040	    (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
2041		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
2042		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
2043	}
2044	/* if u32 range forms a valid s32 range (due to matching sign bit),
2045	 * try to learn from that
2046	 */
2047	if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
2048		reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
2049		reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
2050	}
2051	/* If we cannot cross the sign boundary, then signed and unsigned bounds
2052	 * are the same, so combine.  This works even in the negative case, e.g.
2053	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
2054	 */
2055	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
2056		reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
2057		reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
2058	}
2059}
2060
2061static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
2062{
2063	/* If u64 range forms a valid s64 range (due to matching sign bit),
2064	 * try to learn from that. Let's do a bit of ASCII art to see when
2065	 * this is happening. Let's take u64 range first:
2066	 *
2067	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
2068	 * |-------------------------------|--------------------------------|
2069	 *
2070	 * Valid u64 range is formed when umin and umax are anywhere in the
2071	 * range [0, U64_MAX], and umin <= umax. u64 case is simple and
2072	 * straightforward. Let's see how s64 range maps onto the same range
2073	 * of values, annotated below the line for comparison:
2074	 *
2075	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
2076	 * |-------------------------------|--------------------------------|
2077	 * 0                        S64_MAX S64_MIN                        -1
2078	 *
2079	 * So s64 values basically start in the middle and they are logically
2080	 * contiguous to the right of it, wrapping around from -1 to 0, and
2081	 * then finishing as S64_MAX (0x7fffffffffffffff) right before
2082	 * S64_MIN. We can try drawing the continuity of u64 vs s64 values
2083	 * more visually as mapped to sign-agnostic range of hex values.
2084	 *
2085	 *  u64 start                                               u64 end
2086	 *  _______________________________________________________________
2087	 * /                                                               \
2088	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
2089	 * |-------------------------------|--------------------------------|
2090	 * 0                        S64_MAX S64_MIN                        -1
2091	 *                                / \
2092	 * >------------------------------   ------------------------------->
2093	 * s64 continues...        s64 end   s64 start          s64 "midpoint"
2094	 *
2095	 * What this means is that, in general, we can't always derive
2096	 * something new about u64 from any random s64 range, and vice versa.
2097	 *
2098	 * But we can do that in two particular cases. One is when entire
2099	 * u64/s64 range is *entirely* contained within left half of the above
2100	 * diagram or when it is *entirely* contained in the right half. I.e.:
2101	 *
2102	 * |-------------------------------|--------------------------------|
2103	 *     ^                   ^            ^                 ^
2104	 *     A                   B            C                 D
2105	 *
2106	 * [A, B] and [C, D] are contained entirely in their respective halves
2107	 * and form valid contiguous ranges as both u64 and s64 values. [A, B]
2108	 * will be non-negative both as u64 and s64 (and in fact it will be
2109	 * identical ranges no matter the signedness). [C, D] treated as s64
2110	 * will be a range of negative values, while in u64 it will be
2111	 * non-negative range of values larger than 0x8000000000000000.
2112	 *
2113	 * Now, any other range here can't be represented in both u64 and s64
2114	 * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
2115	 * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
2116	 * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
2117	 * for example. Similarly, valid s64 range [D, A] (going from negative
2118	 * to positive values), would be two separate [D, U64_MAX] and [0, A]
2119	 * ranges as u64. Currently reg_state can't represent two segments per
2120	 * numeric domain, so in such situations we can only derive maximal
2121	 * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
2122	 *
2123	 * So we use these facts to derive umin/umax from smin/smax and vice
2124	 * versa only if they stay within the same "half". This is equivalent
2125	 * to checking sign bit: lower half will have sign bit as zero, upper
2126	 * half have sign bit 1. Below in code we simplify this by just
2127	 * casting umin/umax as smin/smax and checking if they form valid
2128	 * range, and vice versa. Those are equivalent checks.
2129	 */
2130	if ((s64)reg->umin_value <= (s64)reg->umax_value) {
2131		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
2132		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
2133	}
2134	/* If we cannot cross the sign boundary, then signed and unsigned bounds
2135	 * are the same, so combine.  This works even in the negative case, e.g.
2136	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
2137	 */
2138	if ((u64)reg->smin_value <= (u64)reg->smax_value) {
2139		reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
2140		reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
2141	}
2142}
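
/* Two illustrative cases for the deduction above (values chosen only for
 * exposition): a u64 range [1, 5] lies entirely in the "left half", so
 * (s64)1 <= (s64)5 holds and smin/smax can be tightened to [1, 5] as well.
 * A u64 range [0, U64_MAX] crosses the sign boundary: (s64)U64_MAX == -1,
 * the (s64)umin <= (s64)umax check fails and no signed bounds are deduced
 * from the unsigned ones.
 */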
2143
2144static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
2145{
2146	/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
2147	 * values on both sides of 64-bit range in hope to have tighter range.
2148	 * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
2149	 * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
2150	 * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
2151	 * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
2152	 * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
2153	 * better overall bounds for r1 as [0x1'00000001; 0x3'7fffffff].
2154	 * We just need to make sure that derived bounds we are intersecting
2155	 * with are well-formed ranges in respective s64 or u64 domain, just
2156	 * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
2157	 */
2158	__u64 new_umin, new_umax;
2159	__s64 new_smin, new_smax;
2160
2161	/* u32 -> u64 tightening, it's always well-formed */
2162	new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
2163	new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
2164	reg->umin_value = max_t(u64, reg->umin_value, new_umin);
2165	reg->umax_value = min_t(u64, reg->umax_value, new_umax);
2166	/* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
2167	new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
2168	new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
2169	reg->smin_value = max_t(s64, reg->smin_value, new_smin);
2170	reg->smax_value = min_t(s64, reg->smax_value, new_smax);
2171
2172	/* if s32 can be treated as valid u32 range, we can use it as well */
2173	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
2174		/* s32 -> u64 tightening */
2175		new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
2176		new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
2177		reg->umin_value = max_t(u64, reg->umin_value, new_umin);
2178		reg->umax_value = min_t(u64, reg->umax_value, new_umax);
2179		/* s32 -> s64 tightening */
2180		new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
2181		new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
2182		reg->smin_value = max_t(s64, reg->smin_value, new_smin);
2183		reg->smax_value = min_t(s64, reg->smax_value, new_smax);
2184	}
2185}
2186
2187static void __reg_deduce_bounds(struct bpf_reg_state *reg)
2188{
2189	__reg32_deduce_bounds(reg);
2190	__reg64_deduce_bounds(reg);
2191	__reg_deduce_mixed_bounds(reg);
2192}
2193
2194/* Attempts to improve var_off based on unsigned min/max information */
2195static void __reg_bound_offset(struct bpf_reg_state *reg)
2196{
2197	struct tnum var64_off = tnum_intersect(reg->var_off,
2198					       tnum_range(reg->umin_value,
2199							  reg->umax_value));
2200	struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
2201					       tnum_range(reg->u32_min_value,
2202							  reg->u32_max_value));
2203
2204	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
2205}
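
/* Example of the intersection above (bounds are hypothetical): with
 * umin == 0, umax == 0xff and a fully unknown var_off, tnum_range(0, 0xff)
 * evaluates to {.value = 0, .mask = 0xff}, so after tnum_intersect() every
 * bit above bit 7 becomes known zero while the low byte stays unknown.
 */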
2206
2207static void reg_bounds_sync(struct bpf_reg_state *reg)
2208{
2209	/* We might have learned new bounds from the var_off. */
2210	__update_reg_bounds(reg);
2211	/* We might have learned something about the sign bit. */
2212	__reg_deduce_bounds(reg);
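	/* Bounds tightened in one domain (e.g. s64 derived from u64) may
	 * allow further tightening in another (e.g. the 32-bit or mixed
	 * 32/64 deduction) on a second pass, hence the repeated call.
	 */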
2213	__reg_deduce_bounds(reg);
2214	/* We might have learned some bits from the bounds. */
2215	__reg_bound_offset(reg);
2216	/* Intersecting with the old var_off might have improved our bounds
2217	 * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
2218	 * then new var_off is (0; 0x7f...fc) which improves our umax.
2219	 */
2220	__update_reg_bounds(reg);
2221}
2222
2223static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
2224				   struct bpf_reg_state *reg, const char *ctx)
2225{
2226	const char *msg;
2227
2228	if (reg->umin_value > reg->umax_value ||
2229	    reg->smin_value > reg->smax_value ||
2230	    reg->u32_min_value > reg->u32_max_value ||
2231	    reg->s32_min_value > reg->s32_max_value) {
2232		msg = "range bounds violation";
2233		goto out;
2234	}
2235
2236	if (tnum_is_const(reg->var_off)) {
2237		u64 uval = reg->var_off.value;
2238		s64 sval = (s64)uval;
2239
2240		if (reg->umin_value != uval || reg->umax_value != uval ||
2241		    reg->smin_value != sval || reg->smax_value != sval) {
2242			msg = "const tnum out of sync with range bounds";
2243			goto out;
2244		}
2245	}
2246
2247	if (tnum_subreg_is_const(reg->var_off)) {
2248		u32 uval32 = tnum_subreg(reg->var_off).value;
2249		s32 sval32 = (s32)uval32;
2250
2251		if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
2252		    reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
2253			msg = "const subreg tnum out of sync with range bounds";
2254			goto out;
2255		}
2256	}
2257
2258	return 0;
2259out:
2260	verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
2261		"s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
2262		ctx, msg, reg->umin_value, reg->umax_value,
2263		reg->smin_value, reg->smax_value,
2264		reg->u32_min_value, reg->u32_max_value,
2265		reg->s32_min_value, reg->s32_max_value,
2266		reg->var_off.value, reg->var_off.mask);
2267	if (env->test_reg_invariants)
2268		return -EFAULT;
2269	__mark_reg_unbounded(reg);
2270	return 0;
2271}
2272
2273static bool __reg32_bound_s64(s32 a)
2274{
2275	return a >= 0 && a <= S32_MAX;
2276}
2277
2278static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
2279{
2280	reg->umin_value = reg->u32_min_value;
2281	reg->umax_value = reg->u32_max_value;
2282
2283	/* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
2284	 * be positive otherwise set to worse case bounds and refine later
2285	 * from tnum.
2286	 */
2287	if (__reg32_bound_s64(reg->s32_min_value) &&
2288	    __reg32_bound_s64(reg->s32_max_value)) {
2289		reg->smin_value = reg->s32_min_value;
2290		reg->smax_value = reg->s32_max_value;
2291	} else {
2292		reg->smin_value = 0;
2293		reg->smax_value = U32_MAX;
2294	}
2295}
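
/* Example for the assignment above (input state is hypothetical): if the
 * 32-bit bounds are u32 == [10, 20] and s32 == [-5, 20], the u64 bounds
 * become [10, 20], but since s32_min_value is negative __reg32_bound_s64()
 * fails and the s64 bounds fall back to the conservative [0, U32_MAX], to
 * be refined later from the tnum.
 */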
2296
2297/* Mark a register as having a completely unknown (scalar) value. */
2298static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
2299{
2300	/*
2301	 * Clear type, off, and union(map_ptr, range) and
2302	 * padding between 'type' and union
2303	 */
2304	memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
2305	reg->type = SCALAR_VALUE;
2306	reg->id = 0;
2307	reg->ref_obj_id = 0;
2308	reg->var_off = tnum_unknown;
2309	reg->frameno = 0;
2310	reg->precise = false;
2311	__mark_reg_unbounded(reg);
2312}
2313
2314/* Mark a register as having a completely unknown (scalar) value,
2315 * initialize .precise as true when not bpf capable.
2316 */
2317static void __mark_reg_unknown(const struct bpf_verifier_env *env,
2318			       struct bpf_reg_state *reg)
2319{
2320	__mark_reg_unknown_imprecise(reg);
2321	reg->precise = !env->bpf_capable;
2322}
2323
2324static void mark_reg_unknown(struct bpf_verifier_env *env,
2325			     struct bpf_reg_state *regs, u32 regno)
2326{
2327	if (WARN_ON(regno >= MAX_BPF_REG)) {
2328		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
2329		/* Something bad happened, let's kill all regs except FP */
2330		for (regno = 0; regno < BPF_REG_FP; regno++)
2331			__mark_reg_not_init(env, regs + regno);
2332		return;
2333	}
2334	__mark_reg_unknown(env, regs + regno);
2335}
2336
2337static void __mark_reg_not_init(const struct bpf_verifier_env *env,
2338				struct bpf_reg_state *reg)
2339{
2340	__mark_reg_unknown(env, reg);
2341	reg->type = NOT_INIT;
2342}
2343
2344static void mark_reg_not_init(struct bpf_verifier_env *env,
2345			      struct bpf_reg_state *regs, u32 regno)
2346{
2347	if (WARN_ON(regno >= MAX_BPF_REG)) {
2348		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
2349		/* Something bad happened, let's kill all regs except FP */
2350		for (regno = 0; regno < BPF_REG_FP; regno++)
2351			__mark_reg_not_init(env, regs + regno);
2352		return;
2353	}
2354	__mark_reg_not_init(env, regs + regno);
2355}
2356
2357static void mark_btf_ld_reg(struct bpf_verifier_env *env,
2358			    struct bpf_reg_state *regs, u32 regno,
2359			    enum bpf_reg_type reg_type,
2360			    struct btf *btf, u32 btf_id,
2361			    enum bpf_type_flag flag)
2362{
2363	if (reg_type == SCALAR_VALUE) {
2364		mark_reg_unknown(env, regs, regno);
2365		return;
2366	}
2367	mark_reg_known_zero(env, regs, regno);
2368	regs[regno].type = PTR_TO_BTF_ID | flag;
2369	regs[regno].btf = btf;
2370	regs[regno].btf_id = btf_id;
2371	if (type_may_be_null(flag))
2372		regs[regno].id = ++env->id_gen;
2373}
2374
2375#define DEF_NOT_SUBREG	(0)
2376static void init_reg_state(struct bpf_verifier_env *env,
2377			   struct bpf_func_state *state)
2378{
2379	struct bpf_reg_state *regs = state->regs;
2380	int i;
2381
2382	for (i = 0; i < MAX_BPF_REG; i++) {
2383		mark_reg_not_init(env, regs, i);
2384		regs[i].live = REG_LIVE_NONE;
2385		regs[i].parent = NULL;
2386		regs[i].subreg_def = DEF_NOT_SUBREG;
2387	}
2388
2389	/* frame pointer */
2390	regs[BPF_REG_FP].type = PTR_TO_STACK;
2391	mark_reg_known_zero(env, regs, BPF_REG_FP);
2392	regs[BPF_REG_FP].frameno = state->frameno;
2393}
2394
2395static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
2396{
2397	return (struct bpf_retval_range){ minval, maxval };
2398}
2399
2400#define BPF_MAIN_FUNC (-1)
2401static void init_func_state(struct bpf_verifier_env *env,
2402			    struct bpf_func_state *state,
2403			    int callsite, int frameno, int subprogno)
2404{
2405	state->callsite = callsite;
2406	state->frameno = frameno;
2407	state->subprogno = subprogno;
2408	state->callback_ret_range = retval_range(0, 0);
2409	init_reg_state(env, state);
2410	mark_verifier_state_scratched(env);
2411}
2412
2413/* Similar to push_stack(), but for async callbacks */
2414static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
2415						int insn_idx, int prev_insn_idx,
2416						int subprog, bool is_sleepable)
2417{
2418	struct bpf_verifier_stack_elem *elem;
2419	struct bpf_func_state *frame;
2420
2421	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
2422	if (!elem)
2423		goto err;
2424
2425	elem->insn_idx = insn_idx;
2426	elem->prev_insn_idx = prev_insn_idx;
2427	elem->next = env->head;
2428	elem->log_pos = env->log.end_pos;
2429	env->head = elem;
2430	env->stack_size++;
2431	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
2432		verbose(env,
2433			"The sequence of %d jumps is too complex for async cb.\n",
2434			env->stack_size);
2435		goto err;
2436	}
2437	/* Unlike push_stack() do not copy_verifier_state().
2438	 * The caller state doesn't matter.
2439	 * This is an async callback. It starts with a fresh stack.
2440	 * Initialize it similarly to do_check_common().
2441	 */
2442	elem->st.branches = 1;
2443	elem->st.in_sleepable = is_sleepable;
2444	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
2445	if (!frame)
2446		goto err;
2447	init_func_state(env, frame,
2448			BPF_MAIN_FUNC /* callsite */,
2449			0 /* frameno within this callchain */,
2450			subprog /* subprog number within this prog */);
2451	elem->st.frame[0] = frame;
2452	return &elem->st;
2453err:
2454	free_verifier_state(env->cur_state, true);
2455	env->cur_state = NULL;
2456	/* pop all elements and return */
2457	while (!pop_stack(env, NULL, NULL, false));
2458	return NULL;
2459}
2460
2461
2462enum reg_arg_type {
2463	SRC_OP,		/* register is used as source operand */
2464	DST_OP,		/* register is used as destination operand */
2465	DST_OP_NO_MARK	/* same as above, check only, don't mark */
2466};
2467
2468static int cmp_subprogs(const void *a, const void *b)
2469{
2470	return ((struct bpf_subprog_info *)a)->start -
2471	       ((struct bpf_subprog_info *)b)->start;
2472}
2473
2474static int find_subprog(struct bpf_verifier_env *env, int off)
2475{
2476	struct bpf_subprog_info *p;
2477
2478	p = bsearch(&off, env->subprog_info, env->subprog_cnt,
2479		    sizeof(env->subprog_info[0]), cmp_subprogs);
2480	if (!p)
2481		return -ENOENT;
2482	return p - env->subprog_info;
2483
2484}
2485
2486static int add_subprog(struct bpf_verifier_env *env, int off)
2487{
2488	int insn_cnt = env->prog->len;
2489	int ret;
2490
2491	if (off >= insn_cnt || off < 0) {
2492		verbose(env, "call to invalid destination\n");
2493		return -EINVAL;
2494	}
2495	ret = find_subprog(env, off);
2496	if (ret >= 0)
2497		return ret;
2498	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
2499		verbose(env, "too many subprograms\n");
2500		return -E2BIG;
2501	}
2502	/* determine subprog starts. The end is one before the next starts */
2503	env->subprog_info[env->subprog_cnt++].start = off;
2504	sort(env->subprog_info, env->subprog_cnt,
2505	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
2506	return env->subprog_cnt - 1;
2507}
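
/* For illustration, assume a program whose subprogram bodies start at
 * instructions 0, 5 and 12. After the corresponding add_subprog() calls the
 * sorted subprog_info starts are [0, 5, 12]; find_subprog(env, 5) returns
 * index 1, while find_subprog(env, 7) returns -ENOENT because only exact
 * start offsets are recorded. The bsearch() key trick works because 'start'
 * is the first member of struct bpf_subprog_info.
 */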
2508
2509static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
2510{
2511	struct bpf_prog_aux *aux = env->prog->aux;
2512	struct btf *btf = aux->btf;
2513	const struct btf_type *t;
2514	u32 main_btf_id, id;
2515	const char *name;
2516	int ret, i;
2517
2518	/* Non-zero func_info_cnt implies valid btf */
2519	if (!aux->func_info_cnt)
2520		return 0;
2521	main_btf_id = aux->func_info[0].type_id;
2522
2523	t = btf_type_by_id(btf, main_btf_id);
2524	if (!t) {
2525		verbose(env, "invalid btf id for main subprog in func_info\n");
2526		return -EINVAL;
2527	}
2528
2529	name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
2530	if (IS_ERR(name)) {
2531		ret = PTR_ERR(name);
2532		/* If there is no tag present, there is no exception callback */
2533		if (ret == -ENOENT)
2534			ret = 0;
2535		else if (ret == -EEXIST)
2536			verbose(env, "multiple exception callback tags for main subprog\n");
2537		return ret;
2538	}
2539
2540	ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
2541	if (ret < 0) {
2542		verbose(env, "exception callback '%s' could not be found in BTF\n", name);
2543		return ret;
2544	}
2545	id = ret;
2546	t = btf_type_by_id(btf, id);
2547	if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
2548		verbose(env, "exception callback '%s' must have global linkage\n", name);
2549		return -EINVAL;
2550	}
2551	ret = 0;
2552	for (i = 0; i < aux->func_info_cnt; i++) {
2553		if (aux->func_info[i].type_id != id)
2554			continue;
2555		ret = aux->func_info[i].insn_off;
2556		/* Further func_info and subprog checks will also happen
2557		 * later, so assume this is the right insn_off for now.
2558		 */
2559		if (!ret) {
2560			verbose(env, "invalid exception callback insn_off in func_info: 0\n");
2561			ret = -EINVAL;
2562		}
2563	}
2564	if (!ret) {
2565		verbose(env, "exception callback type id not found in func_info\n");
2566		ret = -EINVAL;
2567	}
2568	return ret;
2569}
2570
2571#define MAX_KFUNC_DESCS 256
2572#define MAX_KFUNC_BTFS	256
2573
2574struct bpf_kfunc_desc {
2575	struct btf_func_model func_model;
2576	u32 func_id;
2577	s32 imm;
2578	u16 offset;
2579	unsigned long addr;
2580};
2581
2582struct bpf_kfunc_btf {
2583	struct btf *btf;
2584	struct module *module;
2585	u16 offset;
2586};
2587
2588struct bpf_kfunc_desc_tab {
2589	/* Sorted by func_id (BTF ID) and offset (fd_array offset) during
2590	 * verification. JITs do lookups by bpf_insn, where func_id may not be
2591	 * available, therefore at the end of verification do_misc_fixups()
2592	 * sorts this by imm and offset.
2593	 */
2594	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
2595	u32 nr_descs;
2596};
2597
2598struct bpf_kfunc_btf_tab {
2599	struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
2600	u32 nr_descs;
2601};
2602
2603static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
2604{
2605	const struct bpf_kfunc_desc *d0 = a;
2606	const struct bpf_kfunc_desc *d1 = b;
2607
2608	/* func_id is not greater than BTF_MAX_TYPE */
2609	return d0->func_id - d1->func_id ?: d0->offset - d1->offset;
2610}
2611
2612static int kfunc_btf_cmp_by_off(const void *a, const void *b)
2613{
2614	const struct bpf_kfunc_btf *d0 = a;
2615	const struct bpf_kfunc_btf *d1 = b;
2616
2617	return d0->offset - d1->offset;
2618}
2619
2620static const struct bpf_kfunc_desc *
2621find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
2622{
2623	struct bpf_kfunc_desc desc = {
2624		.func_id = func_id,
2625		.offset = offset,
2626	};
2627	struct bpf_kfunc_desc_tab *tab;
2628
2629	tab = prog->aux->kfunc_tab;
2630	return bsearch(&desc, tab->descs, tab->nr_descs,
2631		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
2632}
2633
2634int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
2635		       u16 btf_fd_idx, u8 **func_addr)
2636{
2637	const struct bpf_kfunc_desc *desc;
2638
2639	desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
2640	if (!desc)
2641		return -EFAULT;
2642
2643	*func_addr = (u8 *)desc->addr;
2644	return 0;
2645}
2646
2647static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
2648					 s16 offset)
2649{
2650	struct bpf_kfunc_btf kf_btf = { .offset = offset };
2651	struct bpf_kfunc_btf_tab *tab;
2652	struct bpf_kfunc_btf *b;
2653	struct module *mod;
2654	struct btf *btf;
2655	int btf_fd;
2656
2657	tab = env->prog->aux->kfunc_btf_tab;
2658	b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
2659		    sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
2660	if (!b) {
2661		if (tab->nr_descs == MAX_KFUNC_BTFS) {
2662			verbose(env, "too many different module BTFs\n");
2663			return ERR_PTR(-E2BIG);
2664		}
2665
2666		if (bpfptr_is_null(env->fd_array)) {
2667			verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
2668			return ERR_PTR(-EPROTO);
2669		}
2670
2671		if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
2672					    offset * sizeof(btf_fd),
2673					    sizeof(btf_fd)))
2674			return ERR_PTR(-EFAULT);
2675
2676		btf = btf_get_by_fd(btf_fd);
2677		if (IS_ERR(btf)) {
2678			verbose(env, "invalid module BTF fd specified\n");
2679			return btf;
2680		}
2681
2682		if (!btf_is_module(btf)) {
2683			verbose(env, "BTF fd for kfunc is not a module BTF\n");
2684			btf_put(btf);
2685			return ERR_PTR(-EINVAL);
2686		}
2687
2688		mod = btf_try_get_module(btf);
2689		if (!mod) {
2690			btf_put(btf);
2691			return ERR_PTR(-ENXIO);
2692		}
2693
2694		b = &tab->descs[tab->nr_descs++];
2695		b->btf = btf;
2696		b->module = mod;
2697		b->offset = offset;
2698
2699		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
2700		     kfunc_btf_cmp_by_off, NULL);
2701	}
2702	return b->btf;
2703}
2704
2705void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
2706{
2707	if (!tab)
2708		return;
2709
2710	while (tab->nr_descs--) {
2711		module_put(tab->descs[tab->nr_descs].module);
2712		btf_put(tab->descs[tab->nr_descs].btf);
2713	}
2714	kfree(tab);
2715}
2716
2717static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
2718{
2719	if (offset) {
2720		if (offset < 0) {
2721			/* In the future, this could be relaxed to raise the limit
2722			 * on the fd index into fd_array, interpreting it as u16.
2723			 */
2724			verbose(env, "negative offset disallowed for kernel module function call\n");
2725			return ERR_PTR(-EINVAL);
2726		}
2727
2728		return __find_kfunc_desc_btf(env, offset);
2729	}
2730	return btf_vmlinux ?: ERR_PTR(-ENOENT);
2731}
2732
2733static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
2734{
2735	const struct btf_type *func, *func_proto;
2736	struct bpf_kfunc_btf_tab *btf_tab;
2737	struct bpf_kfunc_desc_tab *tab;
2738	struct bpf_prog_aux *prog_aux;
2739	struct bpf_kfunc_desc *desc;
2740	const char *func_name;
2741	struct btf *desc_btf;
2742	unsigned long call_imm;
2743	unsigned long addr;
2744	int err;
2745
2746	prog_aux = env->prog->aux;
2747	tab = prog_aux->kfunc_tab;
2748	btf_tab = prog_aux->kfunc_btf_tab;
2749	if (!tab) {
2750		if (!btf_vmlinux) {
2751			verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
2752			return -ENOTSUPP;
2753		}
2754
2755		if (!env->prog->jit_requested) {
2756			verbose(env, "JIT is required for calling kernel function\n");
2757			return -ENOTSUPP;
2758		}
2759
2760		if (!bpf_jit_supports_kfunc_call()) {
2761			verbose(env, "JIT does not support calling kernel function\n");
2762			return -ENOTSUPP;
2763		}
2764
2765		if (!env->prog->gpl_compatible) {
2766			verbose(env, "cannot call kernel function from non-GPL compatible program\n");
2767			return -EINVAL;
2768		}
2769
2770		tab = kzalloc(sizeof(*tab), GFP_KERNEL);
2771		if (!tab)
2772			return -ENOMEM;
2773		prog_aux->kfunc_tab = tab;
2774	}
2775
2776	/* func_id == 0 is always invalid, but instead of returning an error, be
2777	 * conservative and wait until the dead code elimination pass before returning
2778	 * an error, so that invalid calls that get pruned out may still appear in BPF
2779	 * programs loaded from userspace.  It is also required that offset be untouched
2780	 * for such calls.
2781	 */
2782	if (!func_id && !offset)
2783		return 0;
2784
2785	if (!btf_tab && offset) {
2786		btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
2787		if (!btf_tab)
2788			return -ENOMEM;
2789		prog_aux->kfunc_btf_tab = btf_tab;
2790	}
2791
2792	desc_btf = find_kfunc_desc_btf(env, offset);
2793	if (IS_ERR(desc_btf)) {
2794		verbose(env, "failed to find BTF for kernel function\n");
2795		return PTR_ERR(desc_btf);
2796	}
2797
2798	if (find_kfunc_desc(env->prog, func_id, offset))
2799		return 0;
2800
2801	if (tab->nr_descs == MAX_KFUNC_DESCS) {
2802		verbose(env, "too many different kernel function calls\n");
2803		return -E2BIG;
2804	}
2805
2806	func = btf_type_by_id(desc_btf, func_id);
2807	if (!func || !btf_type_is_func(func)) {
2808		verbose(env, "kernel btf_id %u is not a function\n",
2809			func_id);
2810		return -EINVAL;
2811	}
2812	func_proto = btf_type_by_id(desc_btf, func->type);
2813	if (!func_proto || !btf_type_is_func_proto(func_proto)) {
2814		verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
2815			func_id);
2816		return -EINVAL;
2817	}
2818
2819	func_name = btf_name_by_offset(desc_btf, func->name_off);
2820	addr = kallsyms_lookup_name(func_name);
2821	if (!addr) {
2822		verbose(env, "cannot find address for kernel function %s\n",
2823			func_name);
2824		return -EINVAL;
2825	}
2826	specialize_kfunc(env, func_id, offset, &addr);
2827
2828	if (bpf_jit_supports_far_kfunc_call()) {
2829		call_imm = func_id;
2830	} else {
2831		call_imm = BPF_CALL_IMM(addr);
2832		/* Check whether the relative offset overflows desc->imm */
2833		if ((unsigned long)(s32)call_imm != call_imm) {
2834			verbose(env, "address of kernel function %s is out of range\n",
2835				func_name);
2836			return -EINVAL;
2837		}
2838	}
2839
2840	if (bpf_dev_bound_kfunc_id(func_id)) {
2841		err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
2842		if (err)
2843			return err;
2844	}
2845
2846	desc = &tab->descs[tab->nr_descs++];
2847	desc->func_id = func_id;
2848	desc->imm = call_imm;
2849	desc->offset = offset;
2850	desc->addr = addr;
2851	err = btf_distill_func_proto(&env->log, desc_btf,
2852				     func_proto, func_name,
2853				     &desc->func_model);
2854	if (!err)
2855		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
2856		     kfunc_desc_cmp_by_id_off, NULL);
2857	return err;
2858}
2859
2860static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
2861{
2862	const struct bpf_kfunc_desc *d0 = a;
2863	const struct bpf_kfunc_desc *d1 = b;
2864
2865	if (d0->imm != d1->imm)
2866		return d0->imm < d1->imm ? -1 : 1;
2867	if (d0->offset != d1->offset)
2868		return d0->offset < d1->offset ? -1 : 1;
2869	return 0;
2870}
2871
2872static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
2873{
2874	struct bpf_kfunc_desc_tab *tab;
2875
2876	tab = prog->aux->kfunc_tab;
2877	if (!tab)
2878		return;
2879
2880	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
2881	     kfunc_desc_cmp_by_imm_off, NULL);
2882}
2883
2884bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
2885{
2886	return !!prog->aux->kfunc_tab;
2887}
2888
2889const struct btf_func_model *
2890bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
2891			 const struct bpf_insn *insn)
2892{
2893	const struct bpf_kfunc_desc desc = {
2894		.imm = insn->imm,
2895		.offset = insn->off,
2896	};
2897	const struct bpf_kfunc_desc *res;
2898	struct bpf_kfunc_desc_tab *tab;
2899
2900	tab = prog->aux->kfunc_tab;
2901	res = bsearch(&desc, tab->descs, tab->nr_descs,
2902		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
2903
2904	return res ? &res->func_model : NULL;
2905}
2906
2907static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
2908{
2909	struct bpf_subprog_info *subprog = env->subprog_info;
2910	int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
2911	struct bpf_insn *insn = env->prog->insnsi;
2912
2913	/* Add entry function. */
2914	ret = add_subprog(env, 0);
2915	if (ret)
2916		return ret;
2917
2918	for (i = 0; i < insn_cnt; i++, insn++) {
2919		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
2920		    !bpf_pseudo_kfunc_call(insn))
2921			continue;
2922
2923		if (!env->bpf_capable) {
2924			verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
2925			return -EPERM;
2926		}
2927
2928		if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
2929			ret = add_subprog(env, i + insn->imm + 1);
2930		else
2931			ret = add_kfunc_call(env, insn->imm, insn->off);
2932
2933		if (ret < 0)
2934			return ret;
2935	}
2936
2937	ret = bpf_find_exception_callback_insn_off(env);
2938	if (ret < 0)
2939		return ret;
2940	ex_cb_insn = ret;
2941
2942	/* If ex_cb_insn > 0, this means that the main program has a subprog
2943	 * marked using BTF decl tag to serve as the exception callback.
2944	 */
2945	if (ex_cb_insn) {
2946		ret = add_subprog(env, ex_cb_insn);
2947		if (ret < 0)
2948			return ret;
2949		for (i = 1; i < env->subprog_cnt; i++) {
2950			if (env->subprog_info[i].start != ex_cb_insn)
2951				continue;
2952			env->exception_callback_subprog = i;
2953			mark_subprog_exc_cb(env, i);
2954			break;
2955		}
2956	}
2957
2958	/* Add a fake 'exit' subprog which could simplify subprog iteration
2959	 * logic. 'subprog_cnt' should not be increased.
2960	 */
2961	subprog[env->subprog_cnt].start = insn_cnt;
2962
2963	if (env->log.level & BPF_LOG_LEVEL2)
2964		for (i = 0; i < env->subprog_cnt; i++)
2965			verbose(env, "func#%d @%d\n", i, subprog[i].start);
2966
2967	return 0;
2968}
2969
2970static int check_subprogs(struct bpf_verifier_env *env)
2971{
2972	int i, subprog_start, subprog_end, off, cur_subprog = 0;
2973	struct bpf_subprog_info *subprog = env->subprog_info;
2974	struct bpf_insn *insn = env->prog->insnsi;
2975	int insn_cnt = env->prog->len;
2976
2977	/* now check that all jumps are within the same subprog */
2978	subprog_start = subprog[cur_subprog].start;
2979	subprog_end = subprog[cur_subprog + 1].start;
2980	for (i = 0; i < insn_cnt; i++) {
2981		u8 code = insn[i].code;
2982
2983		if (code == (BPF_JMP | BPF_CALL) &&
2984		    insn[i].src_reg == 0 &&
2985		    insn[i].imm == BPF_FUNC_tail_call)
2986			subprog[cur_subprog].has_tail_call = true;
2987		if (BPF_CLASS(code) == BPF_LD &&
2988		    (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
2989			subprog[cur_subprog].has_ld_abs = true;
2990		if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
2991			goto next;
2992		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
2993			goto next;
2994		if (code == (BPF_JMP32 | BPF_JA))
2995			off = i + insn[i].imm + 1;
2996		else
2997			off = i + insn[i].off + 1;
2998		if (off < subprog_start || off >= subprog_end) {
2999			verbose(env, "jump out of range from insn %d to %d\n", i, off);
3000			return -EINVAL;
3001		}
3002next:
3003		if (i == subprog_end - 1) {
3004			/* To avoid fall-through from one subprog into another,
3005			 * the last insn of a subprog should be either an exit,
3006			 * an unconditional jump back, or a bpf_throw call.
3007			 */
3008			if (code != (BPF_JMP | BPF_EXIT) &&
3009			    code != (BPF_JMP32 | BPF_JA) &&
3010			    code != (BPF_JMP | BPF_JA)) {
3011				verbose(env, "last insn is not an exit or jmp\n");
3012				return -EINVAL;
3013			}
3014			subprog_start = subprog_end;
3015			cur_subprog++;
3016			if (cur_subprog < env->subprog_cnt)
3017				subprog_end = subprog[cur_subprog + 1].start;
3018		}
3019	}
3020	return 0;
3021}
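
/* Example of the jump range check above (subprogram layout is
 * hypothetical): with subprogram boundaries [0, 5) and [5, 12), a
 * BPF_JMP | BPF_JA at insn 3 with off == 4 targets insn 3 + 4 + 1 == 8,
 * which lies outside [0, 5) and is rejected with
 * "jump out of range from insn 3 to 8".
 */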
3022
3023/* Parentage chain of this register (or stack slot) should take care of all
3024 * issues like callee-saved registers, stack slot allocation time, etc.
3025 */
3026static int mark_reg_read(struct bpf_verifier_env *env,
3027			 const struct bpf_reg_state *state,
3028			 struct bpf_reg_state *parent, u8 flag)
3029{
3030	bool writes = parent == state->parent; /* Observe write marks */
3031	int cnt = 0;
3032
3033	while (parent) {
3034		/* if read wasn't screened by an earlier write ... */
3035		if (writes && state->live & REG_LIVE_WRITTEN)
3036			break;
3037		if (parent->live & REG_LIVE_DONE) {
3038			verbose(env, "verifier BUG type %s var_off %lld off %d\n",
3039				reg_type_str(env, parent->type),
3040				parent->var_off.value, parent->off);
3041			return -EFAULT;
3042		}
3043		/* The first condition is more likely to be true than the
3044		 * second, so check it first.
3045		 */
3046		if ((parent->live & REG_LIVE_READ) == flag ||
3047		    parent->live & REG_LIVE_READ64)
3048			/* The parentage chain never changes and
3049			 * this parent was already marked as LIVE_READ.
3050			 * There is no need to keep walking the chain again and
3051			 * keep re-marking all parents as LIVE_READ.
3052			 * This case happens when the same register is read
3053			 * multiple times without writes into it in-between.
3054			 * Also, if parent has the stronger REG_LIVE_READ64 set,
3055			 * then no need to set the weak REG_LIVE_READ32.
3056			 */
3057			break;
3058		/* ... then we depend on parent's value */
3059		parent->live |= flag;
3060		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
3061		if (flag == REG_LIVE_READ64)
3062			parent->live &= ~REG_LIVE_READ32;
3063		state = parent;
3064		parent = state->parent;
3065		writes = true;
3066		cnt++;
3067	}
3068
3069	if (env->longest_mark_read_walk < cnt)
3070		env->longest_mark_read_walk = cnt;
3071	return 0;
3072}
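
/* Sketch of the walk above, with states named only for exposition: if R6
 * is read in state C whose register parentage chain is C -> B -> A, the
 * loop marks R6 in B and then in A with the requested read flag. It stops
 * early when a state in the chain wrote the register (REG_LIVE_WRITTEN
 * screens the read from older parents) or when a parent already carries an
 * equal or stronger read mark.
 */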
3073
3074static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
3075{
3076	struct bpf_func_state *state = func(env, reg);
3077	int spi, ret;
3078
3079	/* For CONST_PTR_TO_DYNPTR, it must have already been done by
3080	 * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
3081	 * check_kfunc_call.
3082	 */
3083	if (reg->type == CONST_PTR_TO_DYNPTR)
3084		return 0;
3085	spi = dynptr_get_spi(env, reg);
3086	if (spi < 0)
3087		return spi;
3088	/* Caller ensures dynptr is valid and initialized, which means spi is in
3089	 * bounds and spi is the first dynptr slot. Simply mark stack slot as
3090	 * read.
3091	 */
3092	ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
3093			    state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
3094	if (ret)
3095		return ret;
3096	return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
3097			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
3098}
3099
3100static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
3101			  int spi, int nr_slots)
3102{
3103	struct bpf_func_state *state = func(env, reg);
3104	int err, i;
3105
3106	for (i = 0; i < nr_slots; i++) {
3107		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
3108
3109		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
3110		if (err)
3111			return err;
3112
3113		mark_stack_slot_scratched(env, spi - i);
3114	}
3115
3116	return 0;
3117}
3118
3119/* This function is supposed to be used by the following 32-bit optimization
3120 * code only. It returns TRUE if the source or destination register operates
3121 * on 64 bits, otherwise it returns FALSE.
3122 */
3123static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
3124		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
3125{
3126	u8 code, class, op;
3127
3128	code = insn->code;
3129	class = BPF_CLASS(code);
3130	op = BPF_OP(code);
3131	if (class == BPF_JMP) {
3132		/* BPF_EXIT for "main" will reach here. Return TRUE
3133		 * conservatively.
3134		 */
3135		if (op == BPF_EXIT)
3136			return true;
3137		if (op == BPF_CALL) {
3138			/* BPF to BPF call will reach here because clobbering of
3139			 * caller saved registers is marked with DST_OP_NO_MARK,
3140			 * for which we don't care about the register def since
3141			 * they are marked as NOT_INIT already anyway.
3142			 */
3143			if (insn->src_reg == BPF_PSEUDO_CALL)
3144				return false;
3145			/* Helper call will reach here because of arg type
3146			 * check, conservatively return TRUE.
3147			 */
3148			if (t == SRC_OP)
3149				return true;
3150
3151			return false;
3152		}
3153	}
3154
3155	if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
3156		return false;
3157
3158	if (class == BPF_ALU64 || class == BPF_JMP ||
3159	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
3160		return true;
3161
3162	if (class == BPF_ALU || class == BPF_JMP32)
3163		return false;
3164
3165	if (class == BPF_LDX) {
3166		if (t != SRC_OP)
3167			return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
3168		/* LDX source must be ptr. */
3169		return true;
3170	}
3171
3172	if (class == BPF_STX) {
3173		/* BPF_STX (including atomic variants) has multiple source
3174		 * operands, one of which is a ptr. Check whether the caller is
3175		 * asking about it.
3176		 */
3177		if (t == SRC_OP && reg->type != SCALAR_VALUE)
3178			return true;
3179		return BPF_SIZE(code) == BPF_DW;
3180	}
3181
3182	if (class == BPF_LD) {
3183		u8 mode = BPF_MODE(code);
3184
3185		/* LD_IMM64 */
3186		if (mode == BPF_IMM)
3187			return true;
3188
3189		/* Both LD_IND and LD_ABS return 32-bit data. */
3190		if (t != SRC_OP)
3191			return false;
3192
3193		/* Implicit ctx ptr. */
3194		if (regno == BPF_REG_6)
3195			return true;
3196
3197		/* Explicit source could be any width. */
3198		return true;
3199	}
3200
3201	if (class == BPF_ST)
3202		/* The only source register for BPF_ST is a ptr. */
3203		return true;
3204
3205	/* Conservatively return true at default. */
3206	return true;
3207}
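
/* A few illustrative cases for is_reg64(), matching the checks above:
 * BPF_ALU64 | BPF_ADD | BPF_X operates on 64 bits -> true;
 * BPF_ALU | BPF_MOV | BPF_K is a 32-bit op -> false; BPF_LDX with BPF_W and
 * t != SRC_OP defines a 32-bit value -> false, while the same insn queried
 * with t == SRC_OP (the pointer operand) returns true.
 */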
3208
3209/* Return the regno defined by the insn, or -1. */
3210static int insn_def_regno(const struct bpf_insn *insn)
3211{
3212	switch (BPF_CLASS(insn->code)) {
3213	case BPF_JMP:
3214	case BPF_JMP32:
3215	case BPF_ST:
3216		return -1;
3217	case BPF_STX:
3218		if (BPF_MODE(insn->code) == BPF_ATOMIC &&
3219		    (insn->imm & BPF_FETCH)) {
3220			if (insn->imm == BPF_CMPXCHG)
3221				return BPF_REG_0;
3222			else
3223				return insn->src_reg;
3224		} else {
3225			return -1;
3226		}
3227	default:
3228		return insn->dst_reg;
3229	}
3230}
3231
3232/* Return TRUE if INSN has defined any 32-bit value explicitly. */
3233static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
3234{
3235	int dst_reg = insn_def_regno(insn);
3236
3237	if (dst_reg == -1)
3238		return false;
3239
3240	return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
3241}
3242
3243static void mark_insn_zext(struct bpf_verifier_env *env,
3244			   struct bpf_reg_state *reg)
3245{
3246	s32 def_idx = reg->subreg_def;
3247
3248	if (def_idx == DEF_NOT_SUBREG)
3249		return;
3250
3251	env->insn_aux_data[def_idx - 1].zext_dst = true;
3252	/* The dst will be zero extended, so won't be sub-register anymore. */
3253	reg->subreg_def = DEF_NOT_SUBREG;
3254}
3255
3256static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
3257			   enum reg_arg_type t)
3258{
3259	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
3260	struct bpf_reg_state *reg;
3261	bool rw64;
3262
3263	if (regno >= MAX_BPF_REG) {
3264		verbose(env, "R%d is invalid\n", regno);
3265		return -EINVAL;
3266	}
3267
3268	mark_reg_scratched(env, regno);
3269
3270	reg = &regs[regno];
3271	rw64 = is_reg64(env, insn, regno, reg, t);
3272	if (t == SRC_OP) {
3273		/* check whether register used as source operand can be read */
3274		if (reg->type == NOT_INIT) {
3275			verbose(env, "R%d !read_ok\n", regno);
3276			return -EACCES;
3277		}
3278		/* We don't need to worry about FP liveness because it's read-only */
3279		if (regno == BPF_REG_FP)
3280			return 0;
3281
3282		if (rw64)
3283			mark_insn_zext(env, reg);
3284
3285		return mark_reg_read(env, reg, reg->parent,
3286				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
3287	} else {
3288		/* check whether register used as dest operand can be written to */
3289		if (regno == BPF_REG_FP) {
3290			verbose(env, "frame pointer is read only\n");
3291			return -EACCES;
3292		}
3293		reg->live |= REG_LIVE_WRITTEN;
3294		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
3295		if (t == DST_OP)
3296			mark_reg_unknown(env, regs, regno);
3297	}
3298	return 0;
3299}
3300
3301static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
3302			 enum reg_arg_type t)
3303{
3304	struct bpf_verifier_state *vstate = env->cur_state;
3305	struct bpf_func_state *state = vstate->frame[vstate->curframe];
3306
3307	return __check_reg_arg(env, state->regs, regno, t);
3308}
3309
3310static int insn_stack_access_flags(int frameno, int spi)
3311{
3312	return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
3313}
3314
3315static int insn_stack_access_spi(int insn_flags)
3316{
3317	return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
3318}
3319
3320static int insn_stack_access_frameno(int insn_flags)
3321{
3322	return insn_flags & INSN_F_FRAMENO_MASK;
3323}
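
/* Illustration (informational comment only): assuming the current
 * INSN_F_SPI_SHIFT/INSN_F_FRAMENO_MASK layout, a stack access to slot spi=3
 * in frame 1 is recorded as
 *	flags = insn_stack_access_flags(1, 3);
 * and decoded during backtracking with
 *	insn_stack_access_spi(flags) == 3, insn_stack_access_frameno(flags) == 1
 */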
3324
3325static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
3326{
3327	env->insn_aux_data[idx].jmp_point = true;
3328}
3329
3330static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
3331{
3332	return env->insn_aux_data[insn_idx].jmp_point;
3333}
3334
3335/* for any branch, call, exit record the history of jmps in the given state */
3336static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
3337			    int insn_flags)
3338{
3339	u32 cnt = cur->jmp_history_cnt;
3340	struct bpf_jmp_history_entry *p;
3341	size_t alloc_size;
3342
3343	/* combine instruction flags if we already recorded this instruction */
3344	if (env->cur_hist_ent) {
3345		/* atomic instructions push insn_flags twice, for READ and
3346		 * WRITE sides, but they should agree on stack slot
3347		 */
3348		WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
3349			  (env->cur_hist_ent->flags & insn_flags) != insn_flags,
3350			  "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
3351			  env->insn_idx, env->cur_hist_ent->flags, insn_flags);
3352		env->cur_hist_ent->flags |= insn_flags;
3353		return 0;
3354	}
3355
3356	cnt++;
3357	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
3358	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
3359	if (!p)
3360		return -ENOMEM;
3361	cur->jmp_history = p;
3362
3363	p = &cur->jmp_history[cnt - 1];
3364	p->idx = env->insn_idx;
3365	p->prev_idx = env->prev_insn_idx;
3366	p->flags = insn_flags;
3367	cur->jmp_history_cnt = cnt;
3368	env->cur_hist_ent = p;
3369
3370	return 0;
3371}
3372
3373static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
3374						        u32 hist_end, int insn_idx)
3375{
3376	if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
3377		return &st->jmp_history[hist_end - 1];
3378	return NULL;
3379}
3380
3381/* Backtrack one insn at a time. If idx is not at the top of recorded
3382 * history then previous instruction came from straight line execution.
3383 * Return -ENOENT if we exhausted all instructions within given state.
3384 *
3385 * It's legal to have a bit of looping with the same starting and ending
3386 * insn index within the same state, e.g.: 3->4->5->3, so just because the current
3387 * instruction index is the same as the state's first_idx doesn't mean we are
3388 * done. If there is still some jump history left, we should keep going. We
3389 * need to take into account that we might have a jump history between given
3390 * state's parent and itself, due to checkpointing. In this case, we'll have
3391 * history entry recording a jump from last instruction of parent state and
3392 * first instruction of given state.
3393 */
3394static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
3395			     u32 *history)
3396{
3397	u32 cnt = *history;
3398
3399	if (i == st->first_insn_idx) {
3400		if (cnt == 0)
3401			return -ENOENT;
3402		if (cnt == 1 && st->jmp_history[0].idx == i)
3403			return -ENOENT;
3404	}
3405
3406	if (cnt && st->jmp_history[cnt - 1].idx == i) {
3407		i = st->jmp_history[cnt - 1].prev_idx;
3408		(*history)--;
3409	} else {
3410		i--;
3411	}
3412	return i;
3413}
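
/* Simplified walk (illustrative only): if jmp_history records the entries
 * {idx=7, prev_idx=3} and {idx=12, prev_idx=9}, then backtracking from insn 12
 * visits 12 -> 9 (history hit), 9 -> 8 -> 7 (straight line), 7 -> 3 (history
 * hit), and so on, decrementing *history on each hit.
 */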
3414
3415static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
3416{
3417	const struct btf_type *func;
3418	struct btf *desc_btf;
3419
3420	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
3421		return NULL;
3422
3423	desc_btf = find_kfunc_desc_btf(data, insn->off);
3424	if (IS_ERR(desc_btf))
3425		return "<error>";
3426
3427	func = btf_type_by_id(desc_btf, insn->imm);
3428	return btf_name_by_offset(desc_btf, func->name_off);
3429}
3430
3431static inline void bt_init(struct backtrack_state *bt, u32 frame)
3432{
3433	bt->frame = frame;
3434}
3435
3436static inline void bt_reset(struct backtrack_state *bt)
3437{
3438	struct bpf_verifier_env *env = bt->env;
3439
3440	memset(bt, 0, sizeof(*bt));
3441	bt->env = env;
3442}
3443
3444static inline u32 bt_empty(struct backtrack_state *bt)
3445{
3446	u64 mask = 0;
3447	int i;
3448
3449	for (i = 0; i <= bt->frame; i++)
3450		mask |= bt->reg_masks[i] | bt->stack_masks[i];
3451
3452	return mask == 0;
3453}
3454
3455static inline int bt_subprog_enter(struct backtrack_state *bt)
3456{
3457	if (bt->frame == MAX_CALL_FRAMES - 1) {
3458		verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
3459		WARN_ONCE(1, "verifier backtracking bug");
3460		return -EFAULT;
3461	}
3462	bt->frame++;
3463	return 0;
3464}
3465
3466static inline int bt_subprog_exit(struct backtrack_state *bt)
3467{
3468	if (bt->frame == 0) {
3469		verbose(bt->env, "BUG subprog exit from frame 0\n");
3470		WARN_ONCE(1, "verifier backtracking bug");
3471		return -EFAULT;
3472	}
3473	bt->frame--;
3474	return 0;
3475}
3476
3477static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
3478{
3479	bt->reg_masks[frame] |= 1 << reg;
3480}
3481
3482static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
3483{
3484	bt->reg_masks[frame] &= ~(1 << reg);
3485}
3486
3487static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
3488{
3489	bt_set_frame_reg(bt, bt->frame, reg);
3490}
3491
3492static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
3493{
3494	bt_clear_frame_reg(bt, bt->frame, reg);
3495}
3496
3497static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
3498{
3499	bt->stack_masks[frame] |= 1ull << slot;
3500}
3501
3502static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
3503{
3504	bt->stack_masks[frame] &= ~(1ull << slot);
3505}
3506
3507static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
3508{
3509	return bt->reg_masks[frame];
3510}
3511
3512static inline u32 bt_reg_mask(struct backtrack_state *bt)
3513{
3514	return bt->reg_masks[bt->frame];
3515}
3516
3517static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
3518{
3519	return bt->stack_masks[frame];
3520}
3521
3522static inline u64 bt_stack_mask(struct backtrack_state *bt)
3523{
3524	return bt->stack_masks[bt->frame];
3525}
3526
3527static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
3528{
3529	return bt->reg_masks[bt->frame] & (1 << reg);
3530}
3531
3532static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
3533{
3534	return bt->stack_masks[frame] & (1ull << slot);
3535}
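
/* Illustration (informational comment only): in frame 0, bt_set_reg(bt, BPF_REG_2)
 * sets bit 2 of bt->reg_masks[0], so bt_reg_mask(bt) == 0x4 and
 * bt_is_reg_set(bt, BPF_REG_2) is true; bt_set_frame_slot(bt, 0, 5) marks
 * stack slot spi=5 (i.e. fp-48) of frame 0 for precision backtracking.
 */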
3536
3537/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
3538static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
3539{
3540	DECLARE_BITMAP(mask, 64);
3541	bool first = true;
3542	int i, n;
3543
3544	buf[0] = '\0';
3545
3546	bitmap_from_u64(mask, reg_mask);
3547	for_each_set_bit(i, mask, 32) {
3548		n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
3549		first = false;
3550		buf += n;
3551		buf_sz -= n;
3552		if (buf_sz < 0)
3553			break;
3554	}
3555}
3556/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
3557static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
3558{
3559	DECLARE_BITMAP(mask, 64);
3560	bool first = true;
3561	int i, n;
3562
3563	buf[0] = '\0';
3564
3565	bitmap_from_u64(mask, stack_mask);
3566	for_each_set_bit(i, mask, 64) {
3567		n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
3568		first = false;
3569		buf += n;
3570		buf_sz -= n;
3571		if (buf_sz < 0)
3572			break;
3573	}
3574}
3575
3576static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
3577
3578/* For given verifier state backtrack_insn() is called from the last insn to
3579 * the first insn. Its purpose is to compute a bitmask of registers and
3580 * stack slots that need precision in the parent verifier state.
3581 *
3582 * @idx is an index of the instruction we are currently processing;
3583 * @subseq_idx is an index of the subsequent instruction that:
3584 *   - *would be* executed next, if jump history is viewed in forward order;
3585 *   - *was* processed previously during backtracking.
3586 */
3587static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
3588			  struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
3589{
3590	const struct bpf_insn_cbs cbs = {
3591		.cb_call	= disasm_kfunc_name,
3592		.cb_print	= verbose,
3593		.private_data	= env,
3594	};
3595	struct bpf_insn *insn = env->prog->insnsi + idx;
3596	u8 class = BPF_CLASS(insn->code);
3597	u8 opcode = BPF_OP(insn->code);
3598	u8 mode = BPF_MODE(insn->code);
3599	u32 dreg = insn->dst_reg;
3600	u32 sreg = insn->src_reg;
3601	u32 spi, i, fr;
3602
3603	if (insn->code == 0)
3604		return 0;
3605	if (env->log.level & BPF_LOG_LEVEL2) {
3606		fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
3607		verbose(env, "mark_precise: frame%d: regs=%s ",
3608			bt->frame, env->tmp_str_buf);
3609		fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
3610		verbose(env, "stack=%s before ", env->tmp_str_buf);
3611		verbose(env, "%d: ", idx);
3612		print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
3613	}
3614
3615	if (class == BPF_ALU || class == BPF_ALU64) {
3616		if (!bt_is_reg_set(bt, dreg))
3617			return 0;
3618		if (opcode == BPF_END || opcode == BPF_NEG) {
3619			/* sreg is reserved and unused
3620			 * dreg still needs precision before this insn
3621			 */
3622			return 0;
3623		} else if (opcode == BPF_MOV) {
3624			if (BPF_SRC(insn->code) == BPF_X) {
3625				/* dreg = sreg or dreg = (s8, s16, s32)sreg
3626				 * dreg needs precision after this insn
3627				 * sreg needs precision before this insn
3628				 */
3629				bt_clear_reg(bt, dreg);
3630				if (sreg != BPF_REG_FP)
3631					bt_set_reg(bt, sreg);
3632			} else {
3633				/* dreg = K
3634				 * dreg needs precision after this insn.
3635				 * Corresponding register is already marked
3636				 * as precise=true in this verifier state.
3637				 * No further markings in parent are necessary
3638				 */
3639				bt_clear_reg(bt, dreg);
3640			}
3641		} else {
3642			if (BPF_SRC(insn->code) == BPF_X) {
3643				/* dreg += sreg
3644				 * both dreg and sreg need precision
3645				 * before this insn
3646				 */
3647				if (sreg != BPF_REG_FP)
3648					bt_set_reg(bt, sreg);
3649			} /* else dreg += K
3650			   * dreg still needs precision before this insn
3651			   */
3652		}
3653	} else if (class == BPF_LDX) {
3654		if (!bt_is_reg_set(bt, dreg))
3655			return 0;
3656		bt_clear_reg(bt, dreg);
3657
3658		/* scalars can only be spilled into stack w/o losing precision.
3659		 * Load from any other memory can be zero extended.
3660		 * The desire to keep that precision is already indicated
3661		 * by 'precise' mark in corresponding register of this state.
3662		 * No further tracking necessary.
3663		 */
3664		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
3665			return 0;
3666		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
3667		 * that [fp - off] slot contains scalar that needs to be
3668		 * tracked with precision
3669		 */
3670		spi = insn_stack_access_spi(hist->flags);
3671		fr = insn_stack_access_frameno(hist->flags);
3672		bt_set_frame_slot(bt, fr, spi);
3673	} else if (class == BPF_STX || class == BPF_ST) {
3674		if (bt_is_reg_set(bt, dreg))
3675			/* stx & st shouldn't be using _scalar_ dst_reg
3676			 * to access memory. It means backtracking
3677			 * encountered a case of pointer subtraction.
3678			 */
3679			return -ENOTSUPP;
3680		/* scalars can only be spilled into stack */
3681		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
3682			return 0;
3683		spi = insn_stack_access_spi(hist->flags);
3684		fr = insn_stack_access_frameno(hist->flags);
3685		if (!bt_is_frame_slot_set(bt, fr, spi))
3686			return 0;
3687		bt_clear_frame_slot(bt, fr, spi);
3688		if (class == BPF_STX)
3689			bt_set_reg(bt, sreg);
3690	} else if (class == BPF_JMP || class == BPF_JMP32) {
3691		if (bpf_pseudo_call(insn)) {
3692			int subprog_insn_idx, subprog;
3693
3694			subprog_insn_idx = idx + insn->imm + 1;
3695			subprog = find_subprog(env, subprog_insn_idx);
3696			if (subprog < 0)
3697				return -EFAULT;
3698
3699			if (subprog_is_global(env, subprog)) {
3700				/* check that jump history doesn't have any
3701				 * extra instructions from subprog; the next
3702				 * instruction after call to global subprog
3703				 * should be literally the next instruction in the
3704				 * caller program
3705				 */
3706				WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
3707				/* r1-r5 are invalidated after subprog call,
3708				 * so for global func call it shouldn't be set
3709				 * anymore
3710				 */
3711				if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
3712					verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
3713					WARN_ONCE(1, "verifier backtracking bug");
3714					return -EFAULT;
3715				}
3716				/* global subprog always sets R0 */
3717				bt_clear_reg(bt, BPF_REG_0);
3718				return 0;
3719			} else {
3720				/* static subprog call instruction, which
3721				 * means that we are exiting current subprog,
3722				 * so only r1-r5 could be still requested as
3723				 * precise, r0 and r6-r10 or any stack slot in
3724				 * the current frame should be zero by now
3725				 */
3726				if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
3727					verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
3728					WARN_ONCE(1, "verifier backtracking bug");
3729					return -EFAULT;
3730				}
3731				/* we are now tracking register spills correctly,
3732				 * so any instance of leftover slots is a bug
3733				 */
3734				if (bt_stack_mask(bt) != 0) {
3735					verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
3736					WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
3737					return -EFAULT;
3738				}
3739				/* propagate r1-r5 to the caller */
3740				for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
3741					if (bt_is_reg_set(bt, i)) {
3742						bt_clear_reg(bt, i);
3743						bt_set_frame_reg(bt, bt->frame - 1, i);
3744					}
3745				}
3746				if (bt_subprog_exit(bt))
3747					return -EFAULT;
3748				return 0;
3749			}
3750		} else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
3751			/* exit from callback subprog to callback-calling helper or
3752			 * kfunc call. Use idx/subseq_idx check to discern it from
3753			 * straight line code backtracking.
3754			 * Unlike the subprog call handling above, we shouldn't
3755			 * propagate precision of r1-r5 (if any requested), as they are
3756			 * not actually arguments passed directly to callback subprogs
3757			 */
3758			if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
3759				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
3760				WARN_ONCE(1, "verifier backtracking bug");
3761				return -EFAULT;
3762			}
3763			if (bt_stack_mask(bt) != 0) {
3764				verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
3765				WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
3766				return -EFAULT;
3767			}
3768			/* clear r1-r5 in callback subprog's mask */
3769			for (i = BPF_REG_1; i <= BPF_REG_5; i++)
3770				bt_clear_reg(bt, i);
3771			if (bt_subprog_exit(bt))
3772				return -EFAULT;
3773			return 0;
3774		} else if (opcode == BPF_CALL) {
3775			/* kfunc with imm==0 is invalid and fixup_kfunc_call will
3776			 * catch this error later. Make backtracking conservative
3777			 * with ENOTSUPP.
3778			 */
3779			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
3780				return -ENOTSUPP;
3781			/* regular helper call sets R0 */
3782			bt_clear_reg(bt, BPF_REG_0);
3783			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
3784				/* if backtracking was looking for registers R1-R5,
3785				 * they should have been found already.
3786				 */
3787				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
3788				WARN_ONCE(1, "verifier backtracking bug");
3789				return -EFAULT;
3790			}
3791		} else if (opcode == BPF_EXIT) {
3792			bool r0_precise;
3793
3794			/* When backtracking to a nested function call, 'idx' is part of
3795			 * the inner frame and 'subseq_idx' is part of the outer frame.
3796			 * In case of a regular function call, instructions giving
3797			 * precision to registers R1-R5 should have been found already.
3798			 * In case of a callback, it is ok to have R1-R5 marked for
3799			 * backtracking, as these registers are set by the function
3800			 * invoking callback.
3801			 */
3802			if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
3803				for (i = BPF_REG_1; i <= BPF_REG_5; i++)
3804					bt_clear_reg(bt, i);
3805			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
3806				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
3807				WARN_ONCE(1, "verifier backtracking bug");
3808				return -EFAULT;
3809			}
3810
3811			/* BPF_EXIT in subprog or callback always returns
3812			 * right after the call instruction, so by checking
3813			 * whether the instruction at subseq_idx-1 is subprog
3814			 * call or not we can distinguish actual exit from
3815			 * *subprog* from exit from *callback*. In the former
3816			 * case, we need to propagate r0 precision, if
3817			 * necessary. In the latter case we never do that.
3818			 */
3819			r0_precise = subseq_idx - 1 >= 0 &&
3820				     bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
3821				     bt_is_reg_set(bt, BPF_REG_0);
3822
3823			bt_clear_reg(bt, BPF_REG_0);
3824			if (bt_subprog_enter(bt))
3825				return -EFAULT;
3826
3827			if (r0_precise)
3828				bt_set_reg(bt, BPF_REG_0);
3829			/* r6-r9 and stack slots will stay set in caller frame
3830			 * bitmasks until we return back from callee(s)
3831			 */
3832			return 0;
3833		} else if (BPF_SRC(insn->code) == BPF_X) {
3834			if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
3835				return 0;
3836			/* dreg <cond> sreg
3837			 * Both dreg and sreg need precision before
3838			 * this insn. If only sreg was marked precise
3839			 * before, it would be equally necessary to
3840			 * propagate it to dreg.
3841			 */
3842			bt_set_reg(bt, dreg);
3843			bt_set_reg(bt, sreg);
3844			 /* else dreg <cond> K
3845			  * Only dreg still needs precision before
3846			  * this insn, so for the K-based conditional
3847			  * there is nothing new to be marked.
3848			  */
3849		}
3850	} else if (class == BPF_LD) {
3851		if (!bt_is_reg_set(bt, dreg))
3852			return 0;
3853		bt_clear_reg(bt, dreg);
3854		/* It's ld_imm64 or ld_abs or ld_ind.
3855		 * For ld_imm64 no further tracking of precision
3856		 * into parent is necessary
3857		 */
3858		if (mode == BPF_IND || mode == BPF_ABS)
3859			/* to be analyzed */
3860			return -ENOTSUPP;
3861	}
3862	return 0;
3863}
3864
3865/* the scalar precision tracking algorithm:
3866 * . at the start all registers have precise=false.
3867 * . scalar ranges are tracked as normal through alu and jmp insns.
3868 * . once precise value of the scalar register is used in:
3869 *   .  ptr + scalar alu
3870 *   . if (scalar cond K|scalar)
3871 *   .  helper_call(.., scalar, ...) where ARG_CONST is expected
3872 *   backtrack through the verifier states and mark all registers and
3873 *   stack slots with spilled constants that these scalar registers
3874 *   should be precise.
3875 * . during state pruning two registers (or spilled stack slots)
3876 *   are equivalent if both are not precise.
3877 *
3878 * Note the verifier cannot simply walk register parentage chain,
3879 * since many different registers and stack slots could have been
3880 * used to compute single precise scalar.
3881 *
3882 * The approach of starting with precise=true for all registers and then
3883 * backtrack to mark a register as not precise when the verifier detects
3884 * that the program doesn't care about a specific value (e.g., when a helper
3885 * takes a register as an ARG_ANYTHING parameter) is not safe.
3886 *
3887 * It's ok to walk single parentage chain of the verifier states.
3888 * It's possible that this backtracking will go all the way till 1st insn.
3889 * All other branches will be explored for needing precision later.
3890 *
3891 * The backtracking needs to deal with cases like:
3892 *   R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
3893 * r9 -= r8
3894 * r5 = r9
3895 * if r5 > 0x79f goto pc+7
3896 *    R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
3897 * r5 += 1
3898 * ...
3899 * call bpf_perf_event_output#25
3900 *   where .arg5_type = ARG_CONST_SIZE_OR_ZERO
3901 *
3902 * and this case:
3903 * r6 = 1
3904 * call foo // uses callee's r6 inside to compute r0
3905 * r0 += r6
3906 * if r0 == 0 goto
3907 *
3908 * to track above reg_mask/stack_mask needs to be independent for each frame.
3909 *
3910 * Also if parent's curframe > frame where backtracking started,
3911 * the verifier needs to mark registers in both frames, otherwise callees
3912 * may incorrectly prune callers. This is similar to
3913 * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
3914 *
3915 * For now backtracking falls back into conservative marking.
3916 */
3917static void mark_all_scalars_precise(struct bpf_verifier_env *env,
3918				     struct bpf_verifier_state *st)
3919{
3920	struct bpf_func_state *func;
3921	struct bpf_reg_state *reg;
3922	int i, j;
3923
3924	if (env->log.level & BPF_LOG_LEVEL2) {
3925		verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
3926			st->curframe);
3927	}
3928
3929	/* big hammer: mark all scalars precise in this path.
3930	 * pop_stack may still get !precise scalars.
3931	 * We also skip current state and go straight to first parent state,
3932	 * because precision markings in current non-checkpointed state are
3933	 * not needed. See why in the comment in __mark_chain_precision below.
3934	 */
3935	for (st = st->parent; st; st = st->parent) {
3936		for (i = 0; i <= st->curframe; i++) {
3937			func = st->frame[i];
3938			for (j = 0; j < BPF_REG_FP; j++) {
3939				reg = &func->regs[j];
3940				if (reg->type != SCALAR_VALUE || reg->precise)
3941					continue;
3942				reg->precise = true;
3943				if (env->log.level & BPF_LOG_LEVEL2) {
3944					verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
3945						i, j);
3946				}
3947			}
3948			for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
3949				if (!is_spilled_reg(&func->stack[j]))
3950					continue;
3951				reg = &func->stack[j].spilled_ptr;
3952				if (reg->type != SCALAR_VALUE || reg->precise)
3953					continue;
3954				reg->precise = true;
3955				if (env->log.level & BPF_LOG_LEVEL2) {
3956					verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
3957						i, -(j + 1) * 8);
3958				}
3959			}
3960		}
3961	}
3962}
3963
3964static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
3965{
3966	struct bpf_func_state *func;
3967	struct bpf_reg_state *reg;
3968	int i, j;
3969
3970	for (i = 0; i <= st->curframe; i++) {
3971		func = st->frame[i];
3972		for (j = 0; j < BPF_REG_FP; j++) {
3973			reg = &func->regs[j];
3974			if (reg->type != SCALAR_VALUE)
3975				continue;
3976			reg->precise = false;
3977		}
3978		for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
3979			if (!is_spilled_reg(&func->stack[j]))
3980				continue;
3981			reg = &func->stack[j].spilled_ptr;
3982			if (reg->type != SCALAR_VALUE)
3983				continue;
3984			reg->precise = false;
3985		}
3986	}
3987}
3988
3989static bool idset_contains(struct bpf_idset *s, u32 id)
3990{
3991	u32 i;
3992
3993	for (i = 0; i < s->count; ++i)
3994		if (s->ids[i] == id)
3995			return true;
3996
3997	return false;
3998}
3999
4000static int idset_push(struct bpf_idset *s, u32 id)
4001{
4002	if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
4003		return -EFAULT;
4004	s->ids[s->count++] = id;
4005	return 0;
4006}
4007
4008static void idset_reset(struct bpf_idset *s)
4009{
4010	s->count = 0;
4011}
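
/* Typical usage sketch: idset_reset(s); then idset_push(s, id) for each
 * candidate scalar ID and idset_contains(s, id) to test membership. The set
 * is a plain array with linear lookup, bounded by ARRAY_SIZE(s->ids).
 */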
4012
4013/* Collect a set of IDs for all registers currently marked as precise in env->bt.
4014 * Mark all registers with these IDs as precise.
4015 */
4016static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
4017{
4018	struct bpf_idset *precise_ids = &env->idset_scratch;
4019	struct backtrack_state *bt = &env->bt;
4020	struct bpf_func_state *func;
4021	struct bpf_reg_state *reg;
4022	DECLARE_BITMAP(mask, 64);
4023	int i, fr;
4024
4025	idset_reset(precise_ids);
4026
4027	for (fr = bt->frame; fr >= 0; fr--) {
4028		func = st->frame[fr];
4029
4030		bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
4031		for_each_set_bit(i, mask, 32) {
4032			reg = &func->regs[i];
4033			if (!reg->id || reg->type != SCALAR_VALUE)
4034				continue;
4035			if (idset_push(precise_ids, reg->id))
4036				return -EFAULT;
4037		}
4038
4039		bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
4040		for_each_set_bit(i, mask, 64) {
4041			if (i >= func->allocated_stack / BPF_REG_SIZE)
4042				break;
4043			if (!is_spilled_scalar_reg(&func->stack[i]))
4044				continue;
4045			reg = &func->stack[i].spilled_ptr;
4046			if (!reg->id)
4047				continue;
4048			if (idset_push(precise_ids, reg->id))
4049				return -EFAULT;
4050		}
4051	}
4052
4053	for (fr = 0; fr <= st->curframe; ++fr) {
4054		func = st->frame[fr];
4055
4056		for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
4057			reg = &func->regs[i];
4058			if (!reg->id)
4059				continue;
4060			if (!idset_contains(precise_ids, reg->id))
4061				continue;
4062			bt_set_frame_reg(bt, fr, i);
4063		}
4064		for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
4065			if (!is_spilled_scalar_reg(&func->stack[i]))
4066				continue;
4067			reg = &func->stack[i].spilled_ptr;
4068			if (!reg->id)
4069				continue;
4070			if (!idset_contains(precise_ids, reg->id))
4071				continue;
4072			bt_set_frame_slot(bt, fr, i);
4073		}
4074	}
4075
4076	return 0;
4077}
4078
4079/*
4080 * __mark_chain_precision() backtracks BPF program instruction sequence and
4081 * chain of verifier states making sure that register *regno* (if regno >= 0)
4082 * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
4083 * SCALARS, as well as any other registers and slots that contribute to
4084 * a tracked state of given registers/stack slots, depending on specific BPF
4085 * assembly instructions (see backtrack_insn() for exact instruction handling
4086 * logic). This backtracking relies on recorded jmp_history and is able to
4087 * traverse entire chain of parent states. This process ends only when all the
4088 * necessary registers/slots and their transitive dependencies are marked as
4089 * precise.
4090 *
4091 * One important and subtle aspect is that precise marks *do not matter* in
4092 * the currently verified state (current state). It is important to understand
4093 * why this is the case.
4094 *
4095 * First, note that current state is the state that is not yet "checkpointed",
4096 * i.e., it is not yet put into env->explored_states, and it has no children
4097 * states as well. It's ephemeral, and can end up either a) being discarded if
4098 * compatible explored state is found at some point or BPF_EXIT instruction is
4099 * reached or b) checkpointed and put into env->explored_states, branching out
4100 * into one or more children states.
4101 *
4102 * In the former case, precise markings in current state are completely
4103 * ignored by state comparison code (see regsafe() for details). Only
4104 * checkpointed ("old") state precise markings are important, and if old
4105 * state's register/slot is precise, regsafe() assumes current state's
4106 * register/slot as precise and checks value ranges exactly and precisely. If
4107 * states turn out to be compatible, current state's necessary precise
4108 * markings and any required parent states' precise markings are enforced
4109 * after the fact with propagate_precision() logic. But it's
4110 * important to realize that in this case, even after marking current state
4111 * registers/slots as precise, we immediately discard current state. So what
4112 * actually matters is any of the precise markings propagated into current
4113 * state's parent states, which are always checkpointed (due to b) case above).
4114 * As such, for scenario a) it doesn't matter if current state has precise
4115 * markings set or not.
4116 *
4117 * Now, for the scenario b), checkpointing and forking into child(ren)
4118 * state(s). Note that before current state gets to checkpointing step, any
4119 * processed instruction always assumes precise SCALAR register/slot
4120 * knowledge: if precise value or range is useful to prune jump branch, BPF
4121 * verifier takes this opportunity enthusiastically. Similarly, when
4122 * register's value is used to calculate offset or memory address, exact
4123 * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
4124 * what we mentioned above about state comparison ignoring precise markings
4125 * during state comparison, BPF verifier ignores and also assumes precise
4126 * markings *at will* during instruction verification process. But as verifier
4127 * assumes precision, it also propagates any precision dependencies across
4128 * parent states, which are not yet finalized, so can be further restricted
4129 * based on new knowledge gained from restrictions enforced by their children
4130 * states. This is so that once those parent states are finalized, i.e., when
4131 * they have no more active children state, state comparison logic in
4132 * is_state_visited() would enforce strict and precise SCALAR ranges, if
4133 * required for correctness.
4134 *
4135 * To build a bit more intuition, note also that once a state is checkpointed,
4136 * the path we took to get to that state is not important. This is crucial
4137 * property for state pruning. When state is checkpointed and finalized at
4138 * some instruction index, it can be correctly and safely used to "short
4139 * circuit" any *compatible* state that reaches exactly the same instruction
4140 * index. I.e., if we jumped to that instruction from a completely different
4141 * code path than original finalized state was derived from, it doesn't
4142 * matter, current state can be discarded because from that instruction
4143 * forward having a compatible state will ensure we will safely reach the
4144 * exit. States describe preconditions for further exploration, but completely
4145 * forget the history of how we got here.
4146 *
4147 * This also means that even if we needed precise SCALAR range to get to
4148 * finalized state, but from that point forward *that same* SCALAR register is
4149 * never used in a precise context (i.e., its precise value is not needed for
4150 * correctness), it's correct and safe to mark such register as "imprecise"
4151 * (i.e., precise marking set to false). This is what we rely on when we do
4152 * not set precise marking in current state. If no child state requires
4153 * precision for any given SCALAR register, it's safe to dictate that it can
4154 * be imprecise. If any child state does require this register to be precise,
4155 * we'll mark it precise later retroactively during precise markings
4156 * propagation from child state to parent states.
4157 *
4158 * Skipping precise marking setting in current state is a mild version of
4159 * relying on the above observation. But we can utilize this property even
4160 * more aggressively by proactively forgetting any precise marking in the
4161 * current state (which we inherited from the parent state), right before we
4162 * checkpoint it and branch off into new child state. This is done by
4163 * mark_all_scalars_imprecise() to hopefully get more permissive and generic
4164 * finalized states which help in short circuiting more future states.
4165 */
4166static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
4167{
4168	struct backtrack_state *bt = &env->bt;
4169	struct bpf_verifier_state *st = env->cur_state;
4170	int first_idx = st->first_insn_idx;
4171	int last_idx = env->insn_idx;
4172	int subseq_idx = -1;
4173	struct bpf_func_state *func;
4174	struct bpf_reg_state *reg;
4175	bool skip_first = true;
4176	int i, fr, err;
4177
4178	if (!env->bpf_capable)
4179		return 0;
4180
4181	/* set frame number from which we are starting to backtrack */
4182	bt_init(bt, env->cur_state->curframe);
4183
4184	/* Do sanity checks against current state of register and/or stack
4185	 * slot, but don't set precise flag in current state, as precision
4186	 * tracking in the current state is unnecessary.
4187	 */
4188	func = st->frame[bt->frame];
4189	if (regno >= 0) {
4190		reg = &func->regs[regno];
4191		if (reg->type != SCALAR_VALUE) {
4192			WARN_ONCE(1, "backtracing misuse");
4193			return -EFAULT;
4194		}
4195		bt_set_reg(bt, regno);
4196	}
4197
4198	if (bt_empty(bt))
4199		return 0;
4200
4201	for (;;) {
4202		DECLARE_BITMAP(mask, 64);
4203		u32 history = st->jmp_history_cnt;
4204		struct bpf_jmp_history_entry *hist;
4205
4206		if (env->log.level & BPF_LOG_LEVEL2) {
4207			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
4208				bt->frame, last_idx, first_idx, subseq_idx);
4209		}
4210
4211		/* If some register with scalar ID is marked as precise,
4212		 * make sure that all registers sharing this ID are also precise.
4213		 * This is needed to estimate effect of find_equal_scalars().
4214		 * Do this at the last instruction of each state,
4215		 * bpf_reg_state::id fields are valid for these instructions.
4216		 *
4217		 * Allows to track precision in situation like below:
4218		 *
4219		 *     r2 = unknown value
4220		 *     ...
4221		 *   --- state #0 ---
4222		 *     ...
4223		 *     r1 = r2                 // r1 and r2 now share the same ID
4224		 *     ...
4225		 *   --- state #1 {r1.id = A, r2.id = A} ---
4226		 *     ...
4227		 *     if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
4228		 *     ...
4229		 *   --- state #2 {r1.id = A, r2.id = A} ---
4230		 *     r3 = r10
4231		 *     r3 += r1                // need to mark both r1 and r2
4232		 */
4233		if (mark_precise_scalar_ids(env, st))
4234			return -EFAULT;
4235
4236		if (last_idx < 0) {
4237			/* we are at the entry into subprog, which
4238			 * is expected for global funcs, but only if
4239			 * requested precise registers are R1-R5
4240			 * (which are global func's input arguments)
4241			 */
4242			if (st->curframe == 0 &&
4243			    st->frame[0]->subprogno > 0 &&
4244			    st->frame[0]->callsite == BPF_MAIN_FUNC &&
4245			    bt_stack_mask(bt) == 0 &&
4246			    (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
4247				bitmap_from_u64(mask, bt_reg_mask(bt));
4248				for_each_set_bit(i, mask, 32) {
4249					reg = &st->frame[0]->regs[i];
4250					bt_clear_reg(bt, i);
4251					if (reg->type == SCALAR_VALUE)
4252						reg->precise = true;
4253				}
4254				return 0;
4255			}
4256
4257			verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
4258				st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
4259			WARN_ONCE(1, "verifier backtracking bug");
4260			return -EFAULT;
4261		}
4262
4263		for (i = last_idx;;) {
4264			if (skip_first) {
4265				err = 0;
4266				skip_first = false;
4267			} else {
4268				hist = get_jmp_hist_entry(st, history, i);
4269				err = backtrack_insn(env, i, subseq_idx, hist, bt);
4270			}
4271			if (err == -ENOTSUPP) {
4272				mark_all_scalars_precise(env, env->cur_state);
4273				bt_reset(bt);
4274				return 0;
4275			} else if (err) {
4276				return err;
4277			}
4278			if (bt_empty(bt))
4279				/* Found assignment(s) into tracked register in this state.
4280				 * Since this state is already marked, just return.
4281				 * Nothing to be tracked further in the parent state.
4282				 */
4283				return 0;
4284			subseq_idx = i;
4285			i = get_prev_insn_idx(st, i, &history);
4286			if (i == -ENOENT)
4287				break;
4288			if (i >= env->prog->len) {
4289				/* This can happen if backtracking reached insn 0
4290				 * and there are still reg_mask or stack_mask
4291				 * to backtrack.
4292				 * It means the backtracking missed the spot where
4293				 * particular register was initialized with a constant.
4294				 */
4295				verbose(env, "BUG backtracking idx %d\n", i);
4296				WARN_ONCE(1, "verifier backtracking bug");
4297				return -EFAULT;
4298			}
4299		}
4300		st = st->parent;
4301		if (!st)
4302			break;
4303
4304		for (fr = bt->frame; fr >= 0; fr--) {
4305			func = st->frame[fr];
4306			bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
4307			for_each_set_bit(i, mask, 32) {
4308				reg = &func->regs[i];
4309				if (reg->type != SCALAR_VALUE) {
4310					bt_clear_frame_reg(bt, fr, i);
4311					continue;
4312				}
4313				if (reg->precise)
4314					bt_clear_frame_reg(bt, fr, i);
4315				else
4316					reg->precise = true;
4317			}
4318
4319			bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
4320			for_each_set_bit(i, mask, 64) {
4321				if (i >= func->allocated_stack / BPF_REG_SIZE) {
4322					verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
4323						i, func->allocated_stack / BPF_REG_SIZE);
4324					WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
4325					return -EFAULT;
4326				}
4327
4328				if (!is_spilled_scalar_reg(&func->stack[i])) {
4329					bt_clear_frame_slot(bt, fr, i);
4330					continue;
4331				}
4332				reg = &func->stack[i].spilled_ptr;
4333				if (reg->precise)
4334					bt_clear_frame_slot(bt, fr, i);
4335				else
4336					reg->precise = true;
4337			}
4338			if (env->log.level & BPF_LOG_LEVEL2) {
4339				fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
4340					     bt_frame_reg_mask(bt, fr));
4341				verbose(env, "mark_precise: frame%d: parent state regs=%s ",
4342					fr, env->tmp_str_buf);
4343				fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
4344					       bt_frame_stack_mask(bt, fr));
4345				verbose(env, "stack=%s: ", env->tmp_str_buf);
4346				print_verifier_state(env, func, true);
4347			}
4348		}
4349
4350		if (bt_empty(bt))
4351			return 0;
4352
4353		subseq_idx = first_idx;
4354		last_idx = st->last_insn_idx;
4355		first_idx = st->first_insn_idx;
4356	}
4357
4358	/* if we still have requested precise regs or slots, we missed
4359	 * something (e.g., stack access through non-r10 register), so
4360	 * fall back to marking all precise
4361	 */
4362	if (!bt_empty(bt)) {
4363		mark_all_scalars_precise(env, env->cur_state);
4364		bt_reset(bt);
4365	}
4366
4367	return 0;
4368}
4369
4370int mark_chain_precision(struct bpf_verifier_env *env, int regno)
4371{
4372	return __mark_chain_precision(env, regno);
4373}
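
/* Illustration (informational comment only): for a sequence like
 *   r2 = ...		// some scalar value
 *   r3 = r10
 *   r3 += r2		// scalar used in pointer arithmetic
 * the verifier requests mark_chain_precision(env, BPF_REG_2) at the last insn,
 * so every insn and stack spill that contributed to r2's value is marked precise.
 */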
4374
4375/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
4376 * desired reg and stack masks across all relevant frames
4377 */
4378static int mark_chain_precision_batch(struct bpf_verifier_env *env)
4379{
4380	return __mark_chain_precision(env, -1);
4381}
4382
4383static bool is_spillable_regtype(enum bpf_reg_type type)
4384{
4385	switch (base_type(type)) {
4386	case PTR_TO_MAP_VALUE:
4387	case PTR_TO_STACK:
4388	case PTR_TO_CTX:
4389	case PTR_TO_PACKET:
4390	case PTR_TO_PACKET_META:
4391	case PTR_TO_PACKET_END:
4392	case PTR_TO_FLOW_KEYS:
4393	case CONST_PTR_TO_MAP:
4394	case PTR_TO_SOCKET:
4395	case PTR_TO_SOCK_COMMON:
4396	case PTR_TO_TCP_SOCK:
4397	case PTR_TO_XDP_SOCK:
4398	case PTR_TO_BTF_ID:
4399	case PTR_TO_BUF:
4400	case PTR_TO_MEM:
4401	case PTR_TO_FUNC:
4402	case PTR_TO_MAP_KEY:
4403	case PTR_TO_ARENA:
4404		return true;
4405	default:
4406		return false;
4407	}
4408}
4409
4410/* Does this register contain a constant zero? */
4411static bool register_is_null(struct bpf_reg_state *reg)
4412{
4413	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
4414}
4415
4416/* check if register is a constant scalar value */
4417static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
4418{
4419	return reg->type == SCALAR_VALUE &&
4420	       tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
4421}
4422
4423/* assuming is_reg_const() is true, return constant value of a register */
4424static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
4425{
4426	return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
4427}
4428
4429static bool __is_pointer_value(bool allow_ptr_leaks,
4430			       const struct bpf_reg_state *reg)
4431{
4432	if (allow_ptr_leaks)
4433		return false;
4434
4435	return reg->type != SCALAR_VALUE;
4436}
4437
4438static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
4439					struct bpf_reg_state *src_reg)
4440{
4441	if (src_reg->type == SCALAR_VALUE && !src_reg->id &&
4442	    !tnum_is_const(src_reg->var_off))
4443		/* Ensure that src_reg has a valid ID that will be copied to
4444		 * dst_reg and then will be used by find_equal_scalars() to
4445		 * propagate min/max range.
4446		 */
4447		src_reg->id = ++env->id_gen;
4448}
4449
4450/* Copy src state preserving dst->parent and dst->live fields */
4451static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
4452{
4453	struct bpf_reg_state *parent = dst->parent;
4454	enum bpf_reg_liveness live = dst->live;
4455
4456	*dst = *src;
4457	dst->parent = parent;
4458	dst->live = live;
4459}
4460
4461static void save_register_state(struct bpf_verifier_env *env,
4462				struct bpf_func_state *state,
4463				int spi, struct bpf_reg_state *reg,
4464				int size)
4465{
4466	int i;
4467
4468	copy_register_state(&state->stack[spi].spilled_ptr, reg);
4469	if (size == BPF_REG_SIZE)
4470		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
4471
4472	for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
4473		state->stack[spi].slot_type[i - 1] = STACK_SPILL;
4474
4475	/* size < 8 bytes spill */
4476	for (; i; i--)
4477		mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
4478}
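
/* Illustration of the resulting slot_type[] for a narrow (size == 4) spill:
 * bytes 7..4 of the slot become STACK_SPILL while bytes 3..0 go through
 * mark_stack_slot_misc(); REG_LIVE_WRITTEN is only set for a full 8-byte spill.
 */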
4479
4480static bool is_bpf_st_mem(struct bpf_insn *insn)
4481{
4482	return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
4483}
4484
4485static int get_reg_width(struct bpf_reg_state *reg)
4486{
4487	return fls64(reg->umax_value);
4488}
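
/* E.g., a scalar known to be in [0, 255] has umax_value == 255, so
 * get_reg_width() returns 8 bits; a fully unknown 64-bit scalar yields 64.
 */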
4489
4490/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
4491 * stack boundary and alignment are checked in check_mem_access()
4492 */
4493static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
4494				       /* stack frame we're writing to */
4495				       struct bpf_func_state *state,
4496				       int off, int size, int value_regno,
4497				       int insn_idx)
4498{
4499	struct bpf_func_state *cur; /* state of the current function */
4500	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
4501	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
4502	struct bpf_reg_state *reg = NULL;
4503	int insn_flags = insn_stack_access_flags(state->frameno, spi);
4504
4505	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
4506	 * so it's aligned access and [off, off + size) are within stack limits
4507	 */
4508	if (!env->allow_ptr_leaks &&
4509	    is_spilled_reg(&state->stack[spi]) &&
4510	    size != BPF_REG_SIZE) {
4511		verbose(env, "attempt to corrupt spilled pointer on stack\n");
4512		return -EACCES;
4513	}
4514
4515	cur = env->cur_state->frame[env->cur_state->curframe];
4516	if (value_regno >= 0)
4517		reg = &cur->regs[value_regno];
4518	if (!env->bypass_spec_v4) {
4519		bool sanitize = reg && is_spillable_regtype(reg->type);
4520
4521		for (i = 0; i < size; i++) {
4522			u8 type = state->stack[spi].slot_type[i];
4523
4524			if (type != STACK_MISC && type != STACK_ZERO) {
4525				sanitize = true;
4526				break;
4527			}
4528		}
4529
4530		if (sanitize)
4531			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
4532	}
4533
4534	err = destroy_if_dynptr_stack_slot(env, state, spi);
4535	if (err)
4536		return err;
4537
4538	mark_stack_slot_scratched(env, spi);
4539	if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
4540		bool reg_value_fits;
4541
4542		reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
4543		/* Make sure that reg had an ID to build a relation on spill. */
4544		if (reg_value_fits)
4545			assign_scalar_id_before_mov(env, reg);
4546		save_register_state(env, state, spi, reg, size);
4547		/* Break the relation on a narrowing spill. */
4548		if (!reg_value_fits)
4549			state->stack[spi].spilled_ptr.id = 0;
4550	} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
4551		   env->bpf_capable) {
4552		struct bpf_reg_state fake_reg = {};
4553
4554		__mark_reg_known(&fake_reg, insn->imm);
4555		fake_reg.type = SCALAR_VALUE;
4556		save_register_state(env, state, spi, &fake_reg, size);
4557	} else if (reg && is_spillable_regtype(reg->type)) {
4558		/* register containing pointer is being spilled into stack */
4559		if (size != BPF_REG_SIZE) {
4560			verbose_linfo(env, insn_idx, "; ");
4561			verbose(env, "invalid size of register spill\n");
4562			return -EACCES;
4563		}
4564		if (state != cur && reg->type == PTR_TO_STACK) {
4565			verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
4566			return -EINVAL;
4567		}
4568		save_register_state(env, state, spi, reg, size);
4569	} else {
4570		u8 type = STACK_MISC;
4571
4572		/* regular write of data into stack destroys any spilled ptr */
4573		state->stack[spi].spilled_ptr.type = NOT_INIT;
4574		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
4575		if (is_stack_slot_special(&state->stack[spi]))
4576			for (i = 0; i < BPF_REG_SIZE; i++)
4577				scrub_spilled_slot(&state->stack[spi].slot_type[i]);
4578
4579		/* only mark the slot as written if all 8 bytes were written
4580		 * otherwise read propagation may incorrectly stop too soon
4581		 * when stack slots are partially written.
4582		 * This heuristic means that read propagation will be
4583		 * conservative, since it will add reg_live_read marks
4584		 * to stack slots all the way to the first state when a program
4585		 * writes+reads less than 8 bytes
4586		 */
4587		if (size == BPF_REG_SIZE)
4588			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
4589
4590		/* when we zero initialize stack slots mark them as such */
4591		if ((reg && register_is_null(reg)) ||
4592		    (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
4593			/* STACK_ZERO case happened because register spill
4594			 * wasn't properly aligned at the stack slot boundary,
4595			 * so it's not a register spill anymore; force
4596			 * originating register to be precise to make
4597			 * STACK_ZERO correct for subsequent states
4598			 */
4599			err = mark_chain_precision(env, value_regno);
4600			if (err)
4601				return err;
4602			type = STACK_ZERO;
4603		}
4604
4605		/* Mark slots affected by this stack write. */
4606		for (i = 0; i < size; i++)
4607			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
4608		insn_flags = 0; /* not a register spill */
4609	}
4610
4611	if (insn_flags)
4612		return push_jmp_history(env, env->cur_state, insn_flags);
4613	return 0;
4614}
4615
4616/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
4617 * known to contain a variable offset.
4618 * This function checks whether the write is permitted and conservatively
4619 * tracks the effects of the write, considering that each stack slot in the
4620 * dynamic range is potentially written to.
4621 *
4622 * 'off' includes 'regno->off'.
4623 * 'value_regno' can be -1, meaning that an unknown value is being written to
4624 * the stack.
4625 *
4626 * Spilled pointers in range are not marked as written because we don't know
4627 * what's going to be actually written. This means that read propagation for
4628 * future reads cannot be terminated by this write.
4629 *
4630 * For privileged programs, uninitialized stack slots are considered
4631 * initialized by this write (even though we don't know exactly what offsets
4632 * are going to be written to). The idea is that we don't want the verifier to
4633 * reject future reads that access slots written to through variable offsets.
4634 */
4635static int check_stack_write_var_off(struct bpf_verifier_env *env,
4636				     /* func where register points to */
4637				     struct bpf_func_state *state,
4638				     int ptr_regno, int off, int size,
4639				     int value_regno, int insn_idx)
4640{
4641	struct bpf_func_state *cur; /* state of the current function */
4642	int min_off, max_off;
4643	int i, err;
4644	struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
4645	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
4646	bool writing_zero = false;
4647	/* set if the fact that we're writing a zero is used to let any
4648	 * stack slots remain STACK_ZERO
4649	 */
4650	bool zero_used = false;
4651
4652	cur = env->cur_state->frame[env->cur_state->curframe];
4653	ptr_reg = &cur->regs[ptr_regno];
4654	min_off = ptr_reg->smin_value + off;
4655	max_off = ptr_reg->smax_value + off + size;
4656	if (value_regno >= 0)
4657		value_reg = &cur->regs[value_regno];
4658	if ((value_reg && register_is_null(value_reg)) ||
4659	    (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
4660		writing_zero = true;
4661
4662	for (i = min_off; i < max_off; i++) {
4663		int spi;
4664
4665		spi = __get_spi(i);
4666		err = destroy_if_dynptr_stack_slot(env, state, spi);
4667		if (err)
4668			return err;
4669	}
4670
4671	/* Variable offset writes destroy any spilled pointers in range. */
4672	for (i = min_off; i < max_off; i++) {
4673		u8 new_type, *stype;
4674		int slot, spi;
4675
4676		slot = -i - 1;
4677		spi = slot / BPF_REG_SIZE;
4678		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
4679		mark_stack_slot_scratched(env, spi);
4680
4681		if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
4682			/* Reject the write if range we may write to has not
4683			 * been initialized beforehand. If we didn't reject
4684			 * here, the ptr status would be erased below (even
4685			 * though not all slots are actually overwritten),
4686			 * possibly opening the door to leaks.
4687			 *
4688			 * We do however catch STACK_INVALID case below, and
4689			 * only allow reading possibly uninitialized memory
4690			 * later for CAP_PERFMON, as the write may not happen to
4691			 * that slot.
4692			 */
4693			verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
4694				insn_idx, i);
4695			return -EINVAL;
4696		}
4697
4698		/* If writing_zero and the spi slot contains a spill of value 0,
4699		 * maintain the spill type.
4700		 */
4701		if (writing_zero && *stype == STACK_SPILL &&
4702		    is_spilled_scalar_reg(&state->stack[spi])) {
4703			struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
4704
4705			if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
4706				zero_used = true;
4707				continue;
4708			}
4709		}
4710
4711		/* Erase all other spilled pointers. */
4712		state->stack[spi].spilled_ptr.type = NOT_INIT;
4713
4714		/* Update the slot type. */
4715		new_type = STACK_MISC;
4716		if (writing_zero && *stype == STACK_ZERO) {
4717			new_type = STACK_ZERO;
4718			zero_used = true;
4719		}
4720		/* If the slot is STACK_INVALID, we check whether it's OK to
4721		 * pretend that it will be initialized by this write. The slot
4722		 * might not actually be written to, and so if we mark it as
4723		 * initialized future reads might leak uninitialized memory.
4724		 * For privileged programs, we will accept such reads to slots
4725		 * that may or may not be written because, if we rejected
4726		 * them, the error would be too confusing.
4727		 */
4728		if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
4729			verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
4730					insn_idx, i);
4731			return -EINVAL;
4732		}
4733		*stype = new_type;
4734	}
4735	if (zero_used) {
4736		/* backtracking doesn't work for STACK_ZERO yet. */
4737		err = mark_chain_precision(env, value_regno);
4738		if (err)
4739			return err;
4740	}
4741	return 0;
4742}
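
/* Illustration (informational comment only): with r2 a scalar in [-16, -8],
 *   r3 = r10
 *   r3 += r2
 *   *(u64 *)(r3 + 0) = 0
 * may touch bytes fp-16..fp-1, so both 8-byte slots are processed: spilled
 * pointers in range are erased and slots become STACK_MISC, except that
 * STACK_ZERO slots (and spilled zero scalars) may be kept since zero is written.
 */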
4743
4744/* When register 'dst_regno' is assigned some values from stack[min_off,
4745 * max_off), we set the register's type according to the types of the
4746 * respective stack slots. If all the stack values are known to be zeros, then
4747 * so is the destination reg. Otherwise, the register is considered to be
4748 * SCALAR. This function does not deal with register filling; the caller must
4749 * ensure that all spilled registers in the stack range have been marked as
4750 * read.
4751 */
4752static void mark_reg_stack_read(struct bpf_verifier_env *env,
4753				/* func where src register points to */
4754				struct bpf_func_state *ptr_state,
4755				int min_off, int max_off, int dst_regno)
4756{
4757	struct bpf_verifier_state *vstate = env->cur_state;
4758	struct bpf_func_state *state = vstate->frame[vstate->curframe];
4759	int i, slot, spi;
4760	u8 *stype;
4761	int zeros = 0;
4762
4763	for (i = min_off; i < max_off; i++) {
4764		slot = -i - 1;
4765		spi = slot / BPF_REG_SIZE;
4766		mark_stack_slot_scratched(env, spi);
4767		stype = ptr_state->stack[spi].slot_type;
4768		if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
4769			break;
4770		zeros++;
4771	}
4772	if (zeros == max_off - min_off) {
4773		/* Any access_size read into register is zero extended,
4774		 * so the whole register == const_zero.
4775		 */
4776		__mark_reg_const_zero(env, &state->regs[dst_regno]);
4777	} else {
4778		/* have read misc data from the stack */
4779		mark_reg_unknown(env, state->regs, dst_regno);
4780	}
4781	state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
4782}
4783
4784/* Read the stack at 'off' and put the results into the register indicated by
4785 * 'dst_regno'. It handles reg filling if the addressed stack slot is a
4786 * spilled reg.
4787 *
4788 * 'dst_regno' can be -1, meaning that the read value is not going to a
4789 * register.
4790 *
4791 * The access is assumed to be within the current stack bounds.
4792 */
4793static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
4794				      /* func where src register points to */
4795				      struct bpf_func_state *reg_state,
4796				      int off, int size, int dst_regno)
4797{
4798	struct bpf_verifier_state *vstate = env->cur_state;
4799	struct bpf_func_state *state = vstate->frame[vstate->curframe];
4800	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
4801	struct bpf_reg_state *reg;
4802	u8 *stype, type;
4803	int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
4804
4805	stype = reg_state->stack[spi].slot_type;
4806	reg = &reg_state->stack[spi].spilled_ptr;
4807
4808	mark_stack_slot_scratched(env, spi);
4809
4810	if (is_spilled_reg(&reg_state->stack[spi])) {
4811		u8 spill_size = 1;
4812
4813		for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
4814			spill_size++;
4815
4816		if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
4817			if (reg->type != SCALAR_VALUE) {
4818				verbose_linfo(env, env->insn_idx, "; ");
4819				verbose(env, "invalid size of register fill\n");
4820				return -EACCES;
4821			}
4822
4823			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
4824			if (dst_regno < 0)
4825				return 0;
4826
4827			if (size <= spill_size &&
4828			    bpf_stack_narrow_access_ok(off, size, spill_size)) {
4829				/* The earlier check_reg_arg() has decided the
4830				 * subreg_def for this insn.  Save it first.
4831				 */
4832				s32 subreg_def = state->regs[dst_regno].subreg_def;
4833
4834				copy_register_state(&state->regs[dst_regno], reg);
4835				state->regs[dst_regno].subreg_def = subreg_def;
4836
4837				/* Break the relation on a narrowing fill.
4838				 * coerce_reg_to_size will adjust the boundaries.
4839				 */
4840				if (get_reg_width(reg) > size * BITS_PER_BYTE)
4841					state->regs[dst_regno].id = 0;
4842			} else {
4843				int spill_cnt = 0, zero_cnt = 0;
4844
4845				for (i = 0; i < size; i++) {
4846					type = stype[(slot - i) % BPF_REG_SIZE];
4847					if (type == STACK_SPILL) {
4848						spill_cnt++;
4849						continue;
4850					}
4851					if (type == STACK_MISC)
4852						continue;
4853					if (type == STACK_ZERO) {
4854						zero_cnt++;
4855						continue;
4856					}
4857					if (type == STACK_INVALID && env->allow_uninit_stack)
4858						continue;
4859					verbose(env, "invalid read from stack off %d+%d size %d\n",
4860						off, i, size);
4861					return -EACCES;
4862				}
4863
4864				if (spill_cnt == size &&
4865				    tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
4866					__mark_reg_const_zero(env, &state->regs[dst_regno]);
4867					/* this IS register fill, so keep insn_flags */
4868				} else if (zero_cnt == size) {
4869					/* similarly to mark_reg_stack_read(), preserve zeroes */
4870					__mark_reg_const_zero(env, &state->regs[dst_regno]);
4871					insn_flags = 0; /* not restoring original register state */
4872				} else {
4873					mark_reg_unknown(env, state->regs, dst_regno);
4874					insn_flags = 0; /* not restoring original register state */
4875				}
4876			}
4877			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
4878		} else if (dst_regno >= 0) {
4879			/* restore register state from stack */
4880			copy_register_state(&state->regs[dst_regno], reg);
4881			/* mark reg as written since spilled pointer state likely
4882			 * has its liveness marks cleared by is_state_visited()
4883			 * which resets stack/reg liveness for state transitions
4884			 */
4885			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
4886		} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
4887			/* If dst_regno==-1, the caller is asking us whether
4888			 * it is acceptable to use this value as a SCALAR_VALUE
4889			 * (e.g. for XADD).
4890			 * We must not allow unprivileged callers to do that
4891			 * with spilled pointers.
4892			 */
4893			verbose(env, "leaking pointer from stack off %d\n",
4894				off);
4895			return -EACCES;
4896		}
4897		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
4898	} else {
4899		for (i = 0; i < size; i++) {
4900			type = stype[(slot - i) % BPF_REG_SIZE];
4901			if (type == STACK_MISC)
4902				continue;
4903			if (type == STACK_ZERO)
4904				continue;
4905			if (type == STACK_INVALID && env->allow_uninit_stack)
4906				continue;
4907			verbose(env, "invalid read from stack off %d+%d size %d\n",
4908				off, i, size);
4909			return -EACCES;
4910		}
4911		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
4912		if (dst_regno >= 0)
4913			mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
4914		insn_flags = 0; /* we are not restoring spilled register */
4915	}
4916	if (insn_flags)
4917		return push_jmp_history(env, env->cur_state, insn_flags);
4918	return 0;
4919}
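
/* A minimal illustrative sketch (BPF pseudo-asm, not part of the verifier)
 * of the spill/fill handling above, assuming r1 holds a scalar:
 *
 *   *(u64 *)(r10 - 8) = r1     // spill, slot becomes STACK_SPILL
 *   r2 = *(u64 *)(r10 - 8)     // full-width fill: r1's state is copied to r2
 *   r3 = *(u32 *)(r10 - 8)     // narrow fill of a spilled scalar is allowed,
 *                              // bounds are adjusted later via
 *                              // coerce_reg_to_size()
 *
 * A narrow fill of a spilled pointer would instead be rejected with
 * "invalid size of register fill".
 */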
4920
4921enum bpf_access_src {
4922	ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
4923	ACCESS_HELPER = 2,  /* the access is performed by a helper */
4924};
4925
4926static int check_stack_range_initialized(struct bpf_verifier_env *env,
4927					 int regno, int off, int access_size,
4928					 bool zero_size_allowed,
4929					 enum bpf_access_src type,
4930					 struct bpf_call_arg_meta *meta);
4931
4932static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
4933{
4934	return cur_regs(env) + regno;
4935}
4936
4937/* Read the stack at 'ptr_regno + off' and put the result into the register
4938 * 'dst_regno'.
4939 * 'off' includes the pointer register's fixed offset (i.e. 'ptr_regno.off'),
4940 * but not its variable offset.
4941 * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
4942 *
4943 * As opposed to check_stack_read_fixed_off, this function doesn't deal with
4944 * filling registers (i.e. reads of spilled register cannot be detected when
4945 * the offset is not fixed). We conservatively mark 'dst_regno' as containing
4946 * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
4947 * offset; for a fixed offset check_stack_read_fixed_off should be used
4948 * instead.
4949 */
4950static int check_stack_read_var_off(struct bpf_verifier_env *env,
4951				    int ptr_regno, int off, int size, int dst_regno)
4952{
4953	/* The state of the source register. */
4954	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
4955	struct bpf_func_state *ptr_state = func(env, reg);
4956	int err;
4957	int min_off, max_off;
4958
4959	/* Note that we pass a NULL meta, so raw access will not be permitted.
4960	 */
4961	err = check_stack_range_initialized(env, ptr_regno, off, size,
4962					    false, ACCESS_DIRECT, NULL);
4963	if (err)
4964		return err;
4965
4966	min_off = reg->smin_value + off;
4967	max_off = reg->smax_value + off;
4968	mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
4969	return 0;
4970}
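
/* Illustrative sketch (BPF pseudo-asm, register ranges are hypothetical):
 * with r4 known to be in [-16, -8],
 *
 *   r2 = r10
 *   r2 += r4                   // variable-offset stack pointer
 *   r3 = *(u64 *)(r2 + 0)      // handled by check_stack_read_var_off()
 *
 * r3 is conservatively marked as an unknown SCALAR_VALUE, or as known zero
 * when every slot in [min_off, max_off + size) is STACK_ZERO, per
 * mark_reg_stack_read() above.
 */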
4971
4972/* check_stack_read dispatches to check_stack_read_fixed_off or
4973 * check_stack_read_var_off.
4974 *
4975 * The caller must ensure that the offset falls within the allocated stack
4976 * bounds.
4977 *
4978 * 'dst_regno' is a register which will receive the value from the stack. It
4979 * can be -1, meaning that the read value is not going to a register.
4980 */
4981static int check_stack_read(struct bpf_verifier_env *env,
4982			    int ptr_regno, int off, int size,
4983			    int dst_regno)
4984{
4985	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
4986	struct bpf_func_state *state = func(env, reg);
4987	int err;
4988	/* Some accesses are only permitted with a static offset. */
4989	bool var_off = !tnum_is_const(reg->var_off);
4990
4991	/* The offset is required to be static when reads don't go to a
4992	 * register, in order to not leak pointers (see
4993	 * check_stack_read_fixed_off).
4994	 */
4995	if (dst_regno < 0 && var_off) {
4996		char tn_buf[48];
4997
4998		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
4999		verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
5000			tn_buf, off, size);
5001		return -EACCES;
5002	}
5003	/* Variable offset is prohibited for unprivileged mode for simplicity
5004	 * since it requires corresponding support in Spectre masking for stack
5005	 * ALU. See also retrieve_ptr_limit(). The check in
5006	 * check_stack_access_for_ptr_arithmetic() called by
5007	 * adjust_ptr_min_max_vals() prevents users from creating stack pointers
5008	 * with variable offsets, therefore no check is required here. Further,
5009	 * just checking it here would be insufficient as speculative stack
5010	 * writes could still lead to unsafe speculative behaviour.
5011	 */
5012	if (!var_off) {
5013		off += reg->var_off.value;
5014		err = check_stack_read_fixed_off(env, state, off, size,
5015						 dst_regno);
5016	} else {
5017		/* Variable offset stack reads need more conservative handling
5018		 * than fixed offset ones. Note that dst_regno >= 0 on this
5019		 * branch.
5020		 */
5021		err = check_stack_read_var_off(env, ptr_regno, off, size,
5022					       dst_regno);
5023	}
5024	return err;
5025}
5026
5027
5028/* check_stack_write dispatches to check_stack_write_fixed_off or
5029 * check_stack_write_var_off.
5030 *
5031 * 'ptr_regno' is the register used as a pointer into the stack.
5032 * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
5033 * 'value_regno' is the register whose value we're writing to the stack. It can
5034 * be -1, meaning that we're not writing from a register.
5035 *
5036 * The caller must ensure that the offset falls within the maximum stack size.
5037 */
5038static int check_stack_write(struct bpf_verifier_env *env,
5039			     int ptr_regno, int off, int size,
5040			     int value_regno, int insn_idx)
5041{
5042	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
5043	struct bpf_func_state *state = func(env, reg);
5044	int err;
5045
5046	if (tnum_is_const(reg->var_off)) {
5047		off += reg->var_off.value;
5048		err = check_stack_write_fixed_off(env, state, off, size,
5049						  value_regno, insn_idx);
5050	} else {
5051		/* Variable offset stack writes need more conservative handling
5052		 * than fixed offset ones.
5053		 */
5054		err = check_stack_write_var_off(env, state,
5055						ptr_regno, off, size,
5056						value_regno, insn_idx);
5057	}
5058	return err;
5059}
5060
5061static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
5062				 int off, int size, enum bpf_access_type type)
5063{
5064	struct bpf_reg_state *regs = cur_regs(env);
5065	struct bpf_map *map = regs[regno].map_ptr;
5066	u32 cap = bpf_map_flags_to_cap(map);
5067
5068	if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
5069		verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
5070			map->value_size, off, size);
5071		return -EACCES;
5072	}
5073
5074	if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
5075		verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
5076			map->value_size, off, size);
5077		return -EACCES;
5078	}
5079
5080	return 0;
5081}
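
/* Illustrative sketch (BPF C, 'ro_map' is a hypothetical map created with
 * BPF_F_RDONLY_PROG): such a map only carries BPF_MAP_CAN_READ here, so
 *
 *   u32 *val = bpf_map_lookup_elem(&ro_map, &key);
 *   if (val)
 *           *val = 1;          // rejected: "write into map forbidden"
 */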
5082
5083/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
5084static int __check_mem_access(struct bpf_verifier_env *env, int regno,
5085			      int off, int size, u32 mem_size,
5086			      bool zero_size_allowed)
5087{
5088	bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
5089	struct bpf_reg_state *reg;
5090
5091	if (off >= 0 && size_ok && (u64)off + size <= mem_size)
5092		return 0;
5093
5094	reg = &cur_regs(env)[regno];
5095	switch (reg->type) {
5096	case PTR_TO_MAP_KEY:
5097		verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
5098			mem_size, off, size);
5099		break;
5100	case PTR_TO_MAP_VALUE:
5101		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
5102			mem_size, off, size);
5103		break;
5104	case PTR_TO_PACKET:
5105	case PTR_TO_PACKET_META:
5106	case PTR_TO_PACKET_END:
5107		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
5108			off, size, regno, reg->id, off, mem_size);
5109		break;
5110	case PTR_TO_MEM:
5111	default:
5112		verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
5113			mem_size, off, size);
5114	}
5115
5116	return -EACCES;
5117}
5118
5119/* check read/write into a memory region with possible variable offset */
5120static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
5121				   int off, int size, u32 mem_size,
5122				   bool zero_size_allowed)
5123{
5124	struct bpf_verifier_state *vstate = env->cur_state;
5125	struct bpf_func_state *state = vstate->frame[vstate->curframe];
5126	struct bpf_reg_state *reg = &state->regs[regno];
5127	int err;
5128
5129	/* We may have adjusted the register pointing to memory region, so we
5130	 * need to try adding each of min_value and max_value to off
5131	 * to make sure our theoretical access will be safe.
5132	 *
5133	 * The minimum value is only important with signed
5134	 * comparisons where we can't assume the floor of a
5135	 * value is 0.  If we are using signed variables for our
5136	 * indexes we need to make sure that whatever we use
5137	 * will have a set floor within our range.
5138	 */
5139	if (reg->smin_value < 0 &&
5140	    (reg->smin_value == S64_MIN ||
5141	     (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
5142	      reg->smin_value + off < 0)) {
5143		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
5144			regno);
5145		return -EACCES;
5146	}
5147	err = __check_mem_access(env, regno, reg->smin_value + off, size,
5148				 mem_size, zero_size_allowed);
5149	if (err) {
5150		verbose(env, "R%d min value is outside of the allowed memory range\n",
5151			regno);
5152		return err;
5153	}
5154
5155	/* If we haven't set a max value then we need to bail since we can't be
5156	 * sure we won't do bad things.
5157	 * If reg->umax_value + off could overflow, treat that as unbounded too.
5158	 */
5159	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
5160		verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
5161			regno);
5162		return -EACCES;
5163	}
5164	err = __check_mem_access(env, regno, reg->umax_value + off, size,
5165				 mem_size, zero_size_allowed);
5166	if (err) {
5167		verbose(env, "R%d max value is outside of the allowed memory range\n",
5168			regno);
5169		return err;
5170	}
5171
5172	return 0;
5173}
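
/* Illustrative sketch (hypothetical bounds): for a map with value_size == 8
 * and an index register 'idx' known to be in [0, 16],
 *
 *   val = bpf_map_lookup_elem(&map, &key);
 *   if (!val)
 *           return 0;
 *   x = *(u32 *)(val + idx);
 *
 * the smin-based check passes (0 + 4 <= 8) but the umax-based check fails
 * (16 + 4 > 8), so the access is rejected as being outside of the allowed
 * memory range.
 */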
5174
5175static int __check_ptr_off_reg(struct bpf_verifier_env *env,
5176			       const struct bpf_reg_state *reg, int regno,
5177			       bool fixed_off_ok)
5178{
5179	/* Access to this pointer-typed register or passing it to a helper
5180	 * is only allowed in its original, unmodified form.
5181	 */
5182
5183	if (reg->off < 0) {
5184		verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
5185			reg_type_str(env, reg->type), regno, reg->off);
5186		return -EACCES;
5187	}
5188
5189	if (!fixed_off_ok && reg->off) {
5190		verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
5191			reg_type_str(env, reg->type), regno, reg->off);
5192		return -EACCES;
5193	}
5194
5195	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
5196		char tn_buf[48];
5197
5198		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5199		verbose(env, "variable %s access var_off=%s disallowed\n",
5200			reg_type_str(env, reg->type), tn_buf);
5201		return -EACCES;
5202	}
5203
5204	return 0;
5205}
5206
5207static int check_ptr_off_reg(struct bpf_verifier_env *env,
5208		             const struct bpf_reg_state *reg, int regno)
5209{
5210	return __check_ptr_off_reg(env, reg, regno, false);
5211}
5212
5213static int map_kptr_match_type(struct bpf_verifier_env *env,
5214			       struct btf_field *kptr_field,
5215			       struct bpf_reg_state *reg, u32 regno)
5216{
5217	const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
5218	int perm_flags;
5219	const char *reg_name = "";
5220
5221	if (btf_is_kernel(reg->btf)) {
5222		perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
5223
5224		/* Only unreferenced case accepts untrusted pointers */
5225		if (kptr_field->type == BPF_KPTR_UNREF)
5226			perm_flags |= PTR_UNTRUSTED;
5227	} else {
5228		perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
5229		if (kptr_field->type == BPF_KPTR_PERCPU)
5230			perm_flags |= MEM_PERCPU;
5231	}
5232
5233	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
5234		goto bad_type;
5235
5236	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
5237	reg_name = btf_type_name(reg->btf, reg->btf_id);
5238
5239	/* For ref_ptr case, release function check should ensure we get one
5240	 * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
5241	 * normal store of unreferenced kptr, we must ensure var_off is zero.
5242	 * Since ref_ptr cannot be accessed directly by BPF insns, checks for
5243	 * reg->off and reg->ref_obj_id are not needed here.
5244	 */
5245	if (__check_ptr_off_reg(env, reg, regno, true))
5246		return -EACCES;
5247
5248	/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
5249	 * we also need to take into account the reg->off.
5250	 *
5251	 * We want to support cases like:
5252	 *
5253	 * struct foo {
5254	 *         struct bar br;
5255	 *         struct baz bz;
5256	 * };
5257	 *
5258	 * struct foo *v;
5259	 * v = func();	      // PTR_TO_BTF_ID
5260	 * val->foo = v;      // reg->off is zero, btf and btf_id match type
5261	 * val->bar = &v->br; // reg->off is still zero, but we need to retry with
5262	 *                    // first member type of struct after comparison fails
5263	 * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
5264	 *                    // to match type
5265	 *
5266	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
5267	 * is zero. We must also ensure that btf_struct_ids_match does not walk
5268	 * the struct to match type against first member of struct, i.e. reject
5269	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
5270	 * strict mode to true for type match.
5271	 */
5272	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
5273				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
5274				  kptr_field->type != BPF_KPTR_UNREF))
5275		goto bad_type;
5276	return 0;
5277bad_type:
5278	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
5279		reg_type_str(env, reg->type), reg_name);
5280	verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
5281	if (kptr_field->type == BPF_KPTR_UNREF)
5282		verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
5283			targ_name);
5284	else
5285		verbose(env, "\n");
5286	return -EINVAL;
5287}
5288
5289static bool in_sleepable(struct bpf_verifier_env *env)
5290{
5291	return env->prog->sleepable ||
5292	       (env->cur_state && env->cur_state->in_sleepable);
5293}
5294
5295/* Non-sleepable programs and sleepable programs with an explicit bpf_rcu_read_lock()
5296 * can dereference RCU-protected pointers and the result is PTR_TRUSTED.
5297 */
5298static bool in_rcu_cs(struct bpf_verifier_env *env)
5299{
5300	return env->cur_state->active_rcu_lock ||
5301	       env->cur_state->active_lock.ptr ||
5302	       !in_sleepable(env);
5303}
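
/* Illustrative sketch: in_rcu_cs() is trivially true for non-sleepable
 * programs; a sleepable program enters an RCU critical section explicitly:
 *
 *   bpf_rcu_read_lock();
 *   ... dereferences of RCU-safe fields yield MEM_RCU pointers here ...
 *   bpf_rcu_read_unlock();
 */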
5304
5305/* Once GCC supports btf_type_tag the following mechanism will be replaced with a tag check */
5306BTF_SET_START(rcu_protected_types)
5307BTF_ID(struct, prog_test_ref_kfunc)
5308#ifdef CONFIG_CGROUPS
5309BTF_ID(struct, cgroup)
5310#endif
5311#ifdef CONFIG_BPF_JIT
5312BTF_ID(struct, bpf_cpumask)
5313#endif
5314BTF_ID(struct, task_struct)
5315BTF_ID(struct, bpf_crypto_ctx)
5316BTF_SET_END(rcu_protected_types)
5317
5318static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
5319{
5320	if (!btf_is_kernel(btf))
5321		return true;
5322	return btf_id_set_contains(&rcu_protected_types, btf_id);
5323}
5324
5325static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
5326{
5327	struct btf_struct_meta *meta;
5328
5329	if (btf_is_kernel(kptr_field->kptr.btf))
5330		return NULL;
5331
5332	meta = btf_find_struct_meta(kptr_field->kptr.btf,
5333				    kptr_field->kptr.btf_id);
5334
5335	return meta ? meta->record : NULL;
5336}
5337
5338static bool rcu_safe_kptr(const struct btf_field *field)
5339{
5340	const struct btf_field_kptr *kptr = &field->kptr;
5341
5342	return field->type == BPF_KPTR_PERCPU ||
5343	       (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
5344}
5345
5346static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
5347{
5348	struct btf_record *rec;
5349	u32 ret;
5350
5351	ret = PTR_MAYBE_NULL;
5352	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
5353		ret |= MEM_RCU;
5354		if (kptr_field->type == BPF_KPTR_PERCPU)
5355			ret |= MEM_PERCPU;
5356		else if (!btf_is_kernel(kptr_field->kptr.btf))
5357			ret |= MEM_ALLOC;
5358
5359		rec = kptr_pointee_btf_record(kptr_field);
5360		if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
5361			ret |= NON_OWN_REF;
5362	} else {
5363		ret |= PTR_UNTRUSTED;
5364	}
5365
5366	return ret;
5367}
5368
5369static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
5370				 int value_regno, int insn_idx,
5371				 struct btf_field *kptr_field)
5372{
5373	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
5374	int class = BPF_CLASS(insn->code);
5375	struct bpf_reg_state *val_reg;
5376
5377	/* Things we already checked for in check_map_access and caller:
5378	 *  - Reject cases where variable offset may touch kptr
5379	 *  - size of access (must be BPF_DW)
5380	 *  - tnum_is_const(reg->var_off)
5381	 *  - kptr_field->offset == off + reg->var_off.value
5382	 */
5383	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
5384	if (BPF_MODE(insn->code) != BPF_MEM) {
5385		verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
5386		return -EACCES;
5387	}
5388
5389	/* We only allow loading a referenced kptr, since the loaded value will be
5390	 * marked as untrusted, similar to an unreferenced kptr.
5391	 */
5392	if (class != BPF_LDX &&
5393	    (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
5394		verbose(env, "store to referenced kptr disallowed\n");
5395		return -EACCES;
5396	}
5397
5398	if (class == BPF_LDX) {
5399		val_reg = reg_state(env, value_regno);
5400		/* We can simply mark the value_regno receiving the pointer
5401		 * value from map as PTR_TO_BTF_ID, with the correct type.
5402		 */
5403		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
5404				kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
5405	} else if (class == BPF_STX) {
5406		val_reg = reg_state(env, value_regno);
5407		if (!register_is_null(val_reg) &&
5408		    map_kptr_match_type(env, kptr_field, val_reg, value_regno))
5409			return -EACCES;
5410	} else if (class == BPF_ST) {
5411		if (insn->imm) {
5412			verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
5413				kptr_field->offset);
5414			return -EACCES;
5415		}
5416	} else {
5417		verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
5418		return -EACCES;
5419	}
5420	return 0;
5421}
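
/* Illustrative sketch (BPF C, hypothetical types): with a map value like
 *
 *   struct map_val {
 *           struct prog_test_ref_kfunc __kptr *p;
 *   };
 *
 * a plain 8-byte load "q = v->p" (BPF_LDX | BPF_MEM | BPF_DW) is accepted
 * and q is marked PTR_TO_BTF_ID with the flags from btf_ld_kptr_type(),
 * while "v->p = q" via BPF_STX is rejected for a referenced kptr ("store to
 * referenced kptr disallowed"); replacing it requires bpf_kptr_xchg().
 */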
5422
5423/* check read/write into a map element with possible variable offset */
5424static int check_map_access(struct bpf_verifier_env *env, u32 regno,
5425			    int off, int size, bool zero_size_allowed,
5426			    enum bpf_access_src src)
5427{
5428	struct bpf_verifier_state *vstate = env->cur_state;
5429	struct bpf_func_state *state = vstate->frame[vstate->curframe];
5430	struct bpf_reg_state *reg = &state->regs[regno];
5431	struct bpf_map *map = reg->map_ptr;
5432	struct btf_record *rec;
5433	int err, i;
5434
5435	err = check_mem_region_access(env, regno, off, size, map->value_size,
5436				      zero_size_allowed);
5437	if (err)
5438		return err;
5439
5440	if (IS_ERR_OR_NULL(map->record))
5441		return 0;
5442	rec = map->record;
5443	for (i = 0; i < rec->cnt; i++) {
5444		struct btf_field *field = &rec->fields[i];
5445		u32 p = field->offset;
5446
5447		/* If any part of a field can be touched by load/store, reject
5448		 * this program. To check that [x1, x2) overlaps with [y1, y2),
5449		 * it is sufficient to check x1 < y2 && y1 < x2.
5450		 */
5451		if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
5452		    p < reg->umax_value + off + size) {
5453			switch (field->type) {
5454			case BPF_KPTR_UNREF:
5455			case BPF_KPTR_REF:
5456			case BPF_KPTR_PERCPU:
5457				if (src != ACCESS_DIRECT) {
5458					verbose(env, "kptr cannot be accessed indirectly by helper\n");
5459					return -EACCES;
5460				}
5461				if (!tnum_is_const(reg->var_off)) {
5462					verbose(env, "kptr access cannot have variable offset\n");
5463					return -EACCES;
5464				}
5465				if (p != off + reg->var_off.value) {
5466					verbose(env, "kptr access misaligned expected=%u off=%llu\n",
5467						p, off + reg->var_off.value);
5468					return -EACCES;
5469				}
5470				if (size != bpf_size_to_bytes(BPF_DW)) {
5471					verbose(env, "kptr access size must be BPF_DW\n");
5472					return -EACCES;
5473				}
5474				break;
5475			default:
5476				verbose(env, "%s cannot be accessed directly by load/store\n",
5477					btf_field_type_name(field->type));
5478				return -EACCES;
5479			}
5480		}
5481	}
5482	return 0;
5483}
5484
5485#define MAX_PACKET_OFF 0xffff
5486
5487static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
5488				       const struct bpf_call_arg_meta *meta,
5489				       enum bpf_access_type t)
5490{
5491	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
5492
5493	switch (prog_type) {
5494	/* Program types only with direct read access go here! */
5495	case BPF_PROG_TYPE_LWT_IN:
5496	case BPF_PROG_TYPE_LWT_OUT:
5497	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
5498	case BPF_PROG_TYPE_SK_REUSEPORT:
5499	case BPF_PROG_TYPE_FLOW_DISSECTOR:
5500	case BPF_PROG_TYPE_CGROUP_SKB:
5501		if (t == BPF_WRITE)
5502			return false;
5503		fallthrough;
5504
5505	/* Program types with direct read + write access go here! */
5506	case BPF_PROG_TYPE_SCHED_CLS:
5507	case BPF_PROG_TYPE_SCHED_ACT:
5508	case BPF_PROG_TYPE_XDP:
5509	case BPF_PROG_TYPE_LWT_XMIT:
5510	case BPF_PROG_TYPE_SK_SKB:
5511	case BPF_PROG_TYPE_SK_MSG:
5512		if (meta)
5513			return meta->pkt_access;
5514
5515		env->seen_direct_write = true;
5516		return true;
5517
5518	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5519		if (t == BPF_WRITE)
5520			env->seen_direct_write = true;
5521
5522		return true;
5523
5524	default:
5525		return false;
5526	}
5527}
5528
5529static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
5530			       int size, bool zero_size_allowed)
5531{
5532	struct bpf_reg_state *regs = cur_regs(env);
5533	struct bpf_reg_state *reg = &regs[regno];
5534	int err;
5535
5536	/* We may have added a variable offset to the packet pointer; but any
5537	 * reg->range we have comes after that.  We are only checking the fixed
5538	 * offset.
5539	 */
5540
5541	/* We don't allow negative numbers, because we aren't tracking enough
5542	 * detail to prove they're safe.
5543	 */
5544	if (reg->smin_value < 0) {
5545		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
5546			regno);
5547		return -EACCES;
5548	}
5549
5550	err = reg->range < 0 ? -EINVAL :
5551	      __check_mem_access(env, regno, off, size, reg->range,
5552				 zero_size_allowed);
5553	if (err) {
5554		verbose(env, "R%d offset is outside of the packet\n", regno);
5555		return err;
5556	}
5557
5558	/* __check_mem_access has made sure "off + size - 1" is within u16.
5559	 * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
5560	 * otherwise find_good_pkt_pointers would have refused to set range info,
5561	 * in which case __check_mem_access would have rejected this pkt access.
5562	 * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
5563	 */
5564	env->prog->aux->max_pkt_offset =
5565		max_t(u32, env->prog->aux->max_pkt_offset,
5566		      off + reg->umax_value + size - 1);
5567
5568	return err;
5569}
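
/* Illustrative sketch (BPF C): the usual direct packet access pattern that
 * establishes reg->range before the fixed-offset check above:
 *
 *   void *data     = (void *)(long)skb->data;
 *   void *data_end = (void *)(long)skb->data_end;
 *   struct ethhdr *eth = data;
 *
 *   if (data + sizeof(*eth) > data_end)
 *           return TC_ACT_OK;
 *   ... eth->h_proto ...       // off/size checked against reg->range
 */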
5570
5571/* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
5572static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
5573			    enum bpf_access_type t, enum bpf_reg_type *reg_type,
5574			    struct btf **btf, u32 *btf_id)
5575{
5576	struct bpf_insn_access_aux info = {
5577		.reg_type = *reg_type,
5578		.log = &env->log,
5579	};
5580
5581	if (env->ops->is_valid_access &&
5582	    env->ops->is_valid_access(off, size, t, env->prog, &info)) {
5583		/* A non zero info.ctx_field_size indicates that this field is a
5584		 * candidate for later verifier transformation to load the whole
5585		 * field and then apply a mask when accessed with a narrower
5586		 * access than actual ctx access size. A zero info.ctx_field_size
5587		 * will only allow for whole field access and rejects any other
5588		 * type of narrower access.
5589		 */
5590		*reg_type = info.reg_type;
5591
5592		if (base_type(*reg_type) == PTR_TO_BTF_ID) {
5593			*btf = info.btf;
5594			*btf_id = info.btf_id;
5595		} else {
5596			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
5597		}
5598		/* remember the offset of last byte accessed in ctx */
5599		if (env->prog->aux->max_ctx_offset < off + size)
5600			env->prog->aux->max_ctx_offset = off + size;
5601		return 0;
5602	}
5603
5604	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
5605	return -EACCES;
5606}
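
/* Illustrative sketch: a narrower-than-field ctx load such as
 *
 *   __u8 b = *(__u8 *)&skb->mark;
 *
 * is a candidate for the transformation described above only when
 * is_valid_access() reports a non-zero ctx_field_size; the verifier later
 * rewrites it to load the whole field and mask out the unwanted bytes.
 */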
5607
5608static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
5609				  int size)
5610{
5611	if (size < 0 || off < 0 ||
5612	    (u64)off + size > sizeof(struct bpf_flow_keys)) {
5613		verbose(env, "invalid access to flow keys off=%d size=%d\n",
5614			off, size);
5615		return -EACCES;
5616	}
5617	return 0;
5618}
5619
5620static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
5621			     u32 regno, int off, int size,
5622			     enum bpf_access_type t)
5623{
5624	struct bpf_reg_state *regs = cur_regs(env);
5625	struct bpf_reg_state *reg = &regs[regno];
5626	struct bpf_insn_access_aux info = {};
5627	bool valid;
5628
5629	if (reg->smin_value < 0) {
5630		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
5631			regno);
5632		return -EACCES;
5633	}
5634
5635	switch (reg->type) {
5636	case PTR_TO_SOCK_COMMON:
5637		valid = bpf_sock_common_is_valid_access(off, size, t, &info);
5638		break;
5639	case PTR_TO_SOCKET:
5640		valid = bpf_sock_is_valid_access(off, size, t, &info);
5641		break;
5642	case PTR_TO_TCP_SOCK:
5643		valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
5644		break;
5645	case PTR_TO_XDP_SOCK:
5646		valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
5647		break;
5648	default:
5649		valid = false;
5650	}
5651
5652
5653	if (valid) {
5654		env->insn_aux_data[insn_idx].ctx_field_size =
5655			info.ctx_field_size;
5656		return 0;
5657	}
5658
5659	verbose(env, "R%d invalid %s access off=%d size=%d\n",
5660		regno, reg_type_str(env, reg->type), off, size);
5661
5662	return -EACCES;
5663}
5664
5665static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
5666{
5667	return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
5668}
5669
5670static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
5671{
5672	const struct bpf_reg_state *reg = reg_state(env, regno);
5673
5674	return reg->type == PTR_TO_CTX;
5675}
5676
5677static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
5678{
5679	const struct bpf_reg_state *reg = reg_state(env, regno);
5680
5681	return type_is_sk_pointer(reg->type);
5682}
5683
5684static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
5685{
5686	const struct bpf_reg_state *reg = reg_state(env, regno);
5687
5688	return type_is_pkt_pointer(reg->type);
5689}
5690
5691static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
5692{
5693	const struct bpf_reg_state *reg = reg_state(env, regno);
5694
5695	/* Kept separate from is_ctx_reg() since we still want to allow BPF_ST here. */
5696	return reg->type == PTR_TO_FLOW_KEYS;
5697}
5698
5699static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
5700{
5701	const struct bpf_reg_state *reg = reg_state(env, regno);
5702
5703	return reg->type == PTR_TO_ARENA;
5704}
5705
5706static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
5707#ifdef CONFIG_NET
5708	[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
5709	[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
5710	[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
5711#endif
5712	[CONST_PTR_TO_MAP] = btf_bpf_map_id,
5713};
5714
5715static bool is_trusted_reg(const struct bpf_reg_state *reg)
5716{
5717	/* A referenced register is always trusted. */
5718	if (reg->ref_obj_id)
5719		return true;
5720
5721	/* Types listed in the reg2btf_ids are always trusted */
5722	if (reg2btf_ids[base_type(reg->type)] &&
5723	    !bpf_type_has_unsafe_modifiers(reg->type))
5724		return true;
5725
5726	/* If a register is not referenced, it is trusted if it has the
5727	 * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
5728	 * other type modifiers may be safe, but we elect to take an opt-in
5729	 * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
5730	 * not.
5731	 *
5732	 * Eventually, we should make PTR_TRUSTED the single source of truth
5733	 * for whether a register is trusted.
5734	 */
5735	return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
5736	       !bpf_type_has_unsafe_modifiers(reg->type);
5737}
5738
5739static bool is_rcu_reg(const struct bpf_reg_state *reg)
5740{
5741	return reg->type & MEM_RCU;
5742}
5743
5744static void clear_trusted_flags(enum bpf_type_flag *flag)
5745{
5746	*flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
5747}
5748
5749static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
5750				   const struct bpf_reg_state *reg,
5751				   int off, int size, bool strict)
5752{
5753	struct tnum reg_off;
5754	int ip_align;
5755
5756	/* Byte size accesses are always allowed. */
5757	if (!strict || size == 1)
5758		return 0;
5759
5760	/* For platforms that do not have a Kconfig enabling
5761	 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
5762	 * NET_IP_ALIGN is universally set to '2'.  And on platforms
5763	 * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
5764	 * to this code only in strict mode where we want to emulate
5765	 * the NET_IP_ALIGN==2 checking.  Therefore use an
5766	 * unconditional IP align value of '2'.
5767	 */
5768	ip_align = 2;
5769
5770	reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
5771	if (!tnum_is_aligned(reg_off, size)) {
5772		char tn_buf[48];
5773
5774		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5775		verbose(env,
5776			"misaligned packet access off %d+%s+%d+%d size %d\n",
5777			ip_align, tn_buf, reg->off, off, size);
5778		return -EACCES;
5779	}
5780
5781	return 0;
5782}
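
/* Illustrative sketch (hypothetical offsets): with strict alignment, a
 * 2-byte load at pkt + 1 computes reg_off = 2 (NET_IP_ALIGN) + 0 + 1 = 3,
 * which is not 2-byte aligned and is rejected as a misaligned packet
 * access; the same load at pkt + 2 (reg_off = 4) is accepted.
 */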
5783
5784static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
5785				       const struct bpf_reg_state *reg,
5786				       const char *pointer_desc,
5787				       int off, int size, bool strict)
5788{
5789	struct tnum reg_off;
5790
5791	/* Byte size accesses are always allowed. */
5792	if (!strict || size == 1)
5793		return 0;
5794
5795	reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
5796	if (!tnum_is_aligned(reg_off, size)) {
5797		char tn_buf[48];
5798
5799		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5800		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
5801			pointer_desc, tn_buf, reg->off, off, size);
5802		return -EACCES;
5803	}
5804
5805	return 0;
5806}
5807
5808static int check_ptr_alignment(struct bpf_verifier_env *env,
5809			       const struct bpf_reg_state *reg, int off,
5810			       int size, bool strict_alignment_once)
5811{
5812	bool strict = env->strict_alignment || strict_alignment_once;
5813	const char *pointer_desc = "";
5814
5815	switch (reg->type) {
5816	case PTR_TO_PACKET:
5817	case PTR_TO_PACKET_META:
5818		/* Special case, because of NET_IP_ALIGN. Given metadata sits
5819		 * right in front, treat it the very same way.
5820		 */
5821		return check_pkt_ptr_alignment(env, reg, off, size, strict);
5822	case PTR_TO_FLOW_KEYS:
5823		pointer_desc = "flow keys ";
5824		break;
5825	case PTR_TO_MAP_KEY:
5826		pointer_desc = "key ";
5827		break;
5828	case PTR_TO_MAP_VALUE:
5829		pointer_desc = "value ";
5830		break;
5831	case PTR_TO_CTX:
5832		pointer_desc = "context ";
5833		break;
5834	case PTR_TO_STACK:
5835		pointer_desc = "stack ";
5836		/* The stack spill tracking logic in check_stack_write_fixed_off()
5837		 * and check_stack_read_fixed_off() relies on stack accesses being
5838		 * aligned.
5839		 */
5840		strict = true;
5841		break;
5842	case PTR_TO_SOCKET:
5843		pointer_desc = "sock ";
5844		break;
5845	case PTR_TO_SOCK_COMMON:
5846		pointer_desc = "sock_common ";
5847		break;
5848	case PTR_TO_TCP_SOCK:
5849		pointer_desc = "tcp_sock ";
5850		break;
5851	case PTR_TO_XDP_SOCK:
5852		pointer_desc = "xdp_sock ";
5853		break;
5854	case PTR_TO_ARENA:
5855		return 0;
5856	default:
5857		break;
5858	}
5859	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
5860					   strict);
5861}
5862
5863static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
5864{
5865	if (env->prog->jit_requested)
5866		return round_up(stack_depth, 16);
5867
5868	/* round up to 32 bytes, since this is the granularity
5869	 * of the interpreter stack size
5870	 */
5871	return round_up(max_t(u32, stack_depth, 1), 32);
5872}
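
/* Illustrative sketch: a subprog using 100 bytes of stack is accounted as
 * 112 bytes when JIT is requested (round_up(100, 16)) and as 128 bytes for
 * the interpreter (round_up(100, 32)).
 */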
5873
5874/* starting from main bpf function walk all instructions of the function
5875 * and recursively walk all callees that the given function can call.
5876 * Ignore jump and exit insns.
5877 * Since recursion is prevented by check_cfg() this algorithm
5878 * only needs a local stack of MAX_CALL_FRAMES to remember callsites
5879 */
5880static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
5881{
5882	struct bpf_subprog_info *subprog = env->subprog_info;
5883	struct bpf_insn *insn = env->prog->insnsi;
5884	int depth = 0, frame = 0, i, subprog_end;
5885	bool tail_call_reachable = false;
5886	int ret_insn[MAX_CALL_FRAMES];
5887	int ret_prog[MAX_CALL_FRAMES];
5888	int j;
5889
5890	i = subprog[idx].start;
5891process_func:
5892	/* protect against potential stack overflow that might happen when
5893	 * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
5894	 * depth for such cases down to 256 so that the worst case scenario
5895	 * would result in an 8k stack size (the tail call limit of 32 * 256 =
5896	 * 8k).
5897	 *
5898	 * To get the idea what might happen, see an example:
5899	 * func1 -> sub rsp, 128
5900	 *  subfunc1 -> sub rsp, 256
5901	 *  tailcall1 -> add rsp, 256
5902	 *   func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
5903	 *   subfunc2 -> sub rsp, 64
5904	 *   subfunc22 -> sub rsp, 128
5905	 *   tailcall2 -> add rsp, 128
5906	 *    func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
5907	 *
5908	 * tailcall will unwind the current stack frame but it will not get rid
5909	 * of the caller's stack as shown in the example above.
5910	 */
5911	if (idx && subprog[idx].has_tail_call && depth >= 256) {
5912		verbose(env,
5913			"tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
5914			depth);
5915		return -EACCES;
5916	}
5917	depth += round_up_stack_depth(env, subprog[idx].stack_depth);
5918	if (depth > MAX_BPF_STACK) {
5919		verbose(env, "combined stack size of %d calls is %d. Too large\n",
5920			frame + 1, depth);
5921		return -EACCES;
5922	}
5923continue_func:
5924	subprog_end = subprog[idx + 1].start;
5925	for (; i < subprog_end; i++) {
5926		int next_insn, sidx;
5927
5928		if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
5929			bool err = false;
5930
5931			if (!is_bpf_throw_kfunc(insn + i))
5932				continue;
5933			if (subprog[idx].is_cb)
5934				err = true;
5935			for (int c = 0; c < frame && !err; c++) {
5936				if (subprog[ret_prog[c]].is_cb) {
5937					err = true;
5938					break;
5939				}
5940			}
5941			if (!err)
5942				continue;
5943			verbose(env,
5944				"bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
5945				i, idx);
5946			return -EINVAL;
5947		}
5948
5949		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
5950			continue;
5951		/* remember insn and function to return to */
5952		ret_insn[frame] = i + 1;
5953		ret_prog[frame] = idx;
5954
5955		/* find the callee */
5956		next_insn = i + insn[i].imm + 1;
5957		sidx = find_subprog(env, next_insn);
5958		if (sidx < 0) {
5959			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
5960				  next_insn);
5961			return -EFAULT;
5962		}
5963		if (subprog[sidx].is_async_cb) {
5964			if (subprog[sidx].has_tail_call) {
5965				verbose(env, "verifier bug. subprog has tail_call and async cb\n");
5966				return -EFAULT;
5967			}
5968			/* async callbacks don't increase bpf prog stack size unless called directly */
5969			if (!bpf_pseudo_call(insn + i))
5970				continue;
5971			if (subprog[sidx].is_exception_cb) {
5972				verbose(env, "insn %d cannot call exception cb directly\n", i);
5973				return -EINVAL;
5974			}
5975		}
5976		i = next_insn;
5977		idx = sidx;
5978
5979		if (subprog[idx].has_tail_call)
5980			tail_call_reachable = true;
5981
5982		frame++;
5983		if (frame >= MAX_CALL_FRAMES) {
5984			verbose(env, "the call stack of %d frames is too deep !\n",
5985				frame);
5986			return -E2BIG;
5987		}
5988		goto process_func;
5989	}
5990	/* if tail call got detected across bpf2bpf calls then mark each of the
5991	 * currently present subprog frames as tail call reachable subprogs;
5992	 * this info will be utilized by JIT so that we will be preserving the
5993	 * tail call counter throughout bpf2bpf calls combined with tailcalls
5994	 */
5995	if (tail_call_reachable)
5996		for (j = 0; j < frame; j++) {
5997			if (subprog[ret_prog[j]].is_exception_cb) {
5998				verbose(env, "cannot tail call within exception cb\n");
5999				return -EINVAL;
6000			}
6001			subprog[ret_prog[j]].tail_call_reachable = true;
6002		}
6003	if (subprog[0].tail_call_reachable)
6004		env->prog->aux->tail_call_reachable = true;
6005
6006	/* end of for() loop means the last insn of the 'subprog'
6007	 * was reached. Doesn't matter whether it was JA or EXIT
6008	 */
6009	if (frame == 0)
6010		return 0;
6011	depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
6012	frame--;
6013	i = ret_insn[frame];
6014	idx = ret_prog[frame];
6015	goto continue_func;
6016}
6017
6018static int check_max_stack_depth(struct bpf_verifier_env *env)
6019{
6020	struct bpf_subprog_info *si = env->subprog_info;
6021	int ret;
6022
6023	for (int i = 0; i < env->subprog_cnt; i++) {
6024		if (!i || si[i].is_async_cb) {
6025			ret = check_max_stack_depth_subprog(env, i);
6026			if (ret < 0)
6027				return ret;
6028		}
6029		continue;
6030	}
6031	return 0;
6032}
6033
6034#ifndef CONFIG_BPF_JIT_ALWAYS_ON
6035static int get_callee_stack_depth(struct bpf_verifier_env *env,
6036				  const struct bpf_insn *insn, int idx)
6037{
6038	int start = idx + insn->imm + 1, subprog;
6039
6040	subprog = find_subprog(env, start);
6041	if (subprog < 0) {
6042		WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
6043			  start);
6044		return -EFAULT;
6045	}
6046	return env->subprog_info[subprog].stack_depth;
6047}
6048#endif
6049
6050static int __check_buffer_access(struct bpf_verifier_env *env,
6051				 const char *buf_info,
6052				 const struct bpf_reg_state *reg,
6053				 int regno, int off, int size)
6054{
6055	if (off < 0) {
6056		verbose(env,
6057			"R%d invalid %s buffer access: off=%d, size=%d\n",
6058			regno, buf_info, off, size);
6059		return -EACCES;
6060	}
6061	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
6062		char tn_buf[48];
6063
6064		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
6065		verbose(env,
6066			"R%d invalid variable buffer offset: off=%d, var_off=%s\n",
6067			regno, off, tn_buf);
6068		return -EACCES;
6069	}
6070
6071	return 0;
6072}
6073
6074static int check_tp_buffer_access(struct bpf_verifier_env *env,
6075				  const struct bpf_reg_state *reg,
6076				  int regno, int off, int size)
6077{
6078	int err;
6079
6080	err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
6081	if (err)
6082		return err;
6083
6084	if (off + size > env->prog->aux->max_tp_access)
6085		env->prog->aux->max_tp_access = off + size;
6086
6087	return 0;
6088}
6089
6090static int check_buffer_access(struct bpf_verifier_env *env,
6091			       const struct bpf_reg_state *reg,
6092			       int regno, int off, int size,
6093			       bool zero_size_allowed,
6094			       u32 *max_access)
6095{
6096	const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
6097	int err;
6098
6099	err = __check_buffer_access(env, buf_info, reg, regno, off, size);
6100	if (err)
6101		return err;
6102
6103	if (off + size > *max_access)
6104		*max_access = off + size;
6105
6106	return 0;
6107}
6108
6109/* BPF architecture zero extends alu32 ops into 64-bit registers */
6110static void zext_32_to_64(struct bpf_reg_state *reg)
6111{
6112	reg->var_off = tnum_subreg(reg->var_off);
6113	__reg_assign_32_into_64(reg);
6114}
6115
6116/* truncate register to smaller size (in bytes)
6117 * must be called with size < BPF_REG_SIZE
6118 */
6119static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
6120{
6121	u64 mask;
6122
6123	/* clear high bits in bit representation */
6124	reg->var_off = tnum_cast(reg->var_off, size);
6125
6126	/* fix arithmetic bounds */
6127	mask = ((u64)1 << (size * 8)) - 1;
6128	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
6129		reg->umin_value &= mask;
6130		reg->umax_value &= mask;
6131	} else {
6132		reg->umin_value = 0;
6133		reg->umax_value = mask;
6134	}
6135	reg->smin_value = reg->umin_value;
6136	reg->smax_value = reg->umax_value;
6137
6138	/* If size is smaller than a 32-bit register, the 32-bit register
6139	 * values are also truncated, so we push the 64-bit bounds into the
6140	 * 32-bit bounds. The bounds above were already truncated to < 32 bits.
6141	 */
6142	if (size < 4)
6143		__mark_reg32_unbounded(reg);
6144
6145	reg_bounds_sync(reg);
6146}
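
/* Illustrative sketch (hypothetical bounds): truncating a register with
 * umin_value == 0 and umax_value == 0x1ffff to size 2 clears the upper bits
 * of var_off and, since the two bounds disagree above the 16-bit mask,
 * resets the range to [0, 0xffff] before reg_bounds_sync().
 */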
6147
6148static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
6149{
6150	if (size == 1) {
6151		reg->smin_value = reg->s32_min_value = S8_MIN;
6152		reg->smax_value = reg->s32_max_value = S8_MAX;
6153	} else if (size == 2) {
6154		reg->smin_value = reg->s32_min_value = S16_MIN;
6155		reg->smax_value = reg->s32_max_value = S16_MAX;
6156	} else {
6157		/* size == 4 */
6158		reg->smin_value = reg->s32_min_value = S32_MIN;
6159		reg->smax_value = reg->s32_max_value = S32_MAX;
6160	}
6161	reg->umin_value = reg->u32_min_value = 0;
6162	reg->umax_value = U64_MAX;
6163	reg->u32_max_value = U32_MAX;
6164	reg->var_off = tnum_unknown;
6165}
6166
6167static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
6168{
6169	s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
6170	u64 top_smax_value, top_smin_value;
6171	u64 num_bits = size * 8;
6172
6173	if (tnum_is_const(reg->var_off)) {
6174		u64_cval = reg->var_off.value;
6175		if (size == 1)
6176			reg->var_off = tnum_const((s8)u64_cval);
6177		else if (size == 2)
6178			reg->var_off = tnum_const((s16)u64_cval);
6179		else
6180			/* size == 4 */
6181			reg->var_off = tnum_const((s32)u64_cval);
6182
6183		u64_cval = reg->var_off.value;
6184		reg->smax_value = reg->smin_value = u64_cval;
6185		reg->umax_value = reg->umin_value = u64_cval;
6186		reg->s32_max_value = reg->s32_min_value = u64_cval;
6187		reg->u32_max_value = reg->u32_min_value = u64_cval;
6188		return;
6189	}
6190
6191	top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
6192	top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
6193
6194	if (top_smax_value != top_smin_value)
6195		goto out;
6196
6197	/* find the s64_min and s64_max after sign extension */
6198	if (size == 1) {
6199		init_s64_max = (s8)reg->smax_value;
6200		init_s64_min = (s8)reg->smin_value;
6201	} else if (size == 2) {
6202		init_s64_max = (s16)reg->smax_value;
6203		init_s64_min = (s16)reg->smin_value;
6204	} else {
6205		init_s64_max = (s32)reg->smax_value;
6206		init_s64_min = (s32)reg->smin_value;
6207	}
6208
6209	s64_max = max(init_s64_max, init_s64_min);
6210	s64_min = min(init_s64_max, init_s64_min);
6211
6212	/* s64_max and s64_min are either both non-negative or both negative */
6213	if ((s64_max >= 0) == (s64_min >= 0)) {
6214		reg->smin_value = reg->s32_min_value = s64_min;
6215		reg->smax_value = reg->s32_max_value = s64_max;
6216		reg->umin_value = reg->u32_min_value = s64_min;
6217		reg->umax_value = reg->u32_max_value = s64_max;
6218		reg->var_off = tnum_range(s64_min, s64_max);
6219		return;
6220	}
6221
6222out:
6223	set_sext64_default_val(reg, size);
6224}
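
/* Illustrative sketch: sign-extending a known constant 0x80 with size == 1
 * (e.g. from a sign-extending byte load) yields the constant -128, i.e.
 * smin/smax, umin/umax and var_off all collapse to 0xffffffffffffff80.
 */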
6225
6226static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
6227{
6228	if (size == 1) {
6229		reg->s32_min_value = S8_MIN;
6230		reg->s32_max_value = S8_MAX;
6231	} else {
6232		/* size == 2 */
6233		reg->s32_min_value = S16_MIN;
6234		reg->s32_max_value = S16_MAX;
6235	}
6236	reg->u32_min_value = 0;
6237	reg->u32_max_value = U32_MAX;
6238}
6239
6240static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
6241{
6242	s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
6243	u32 top_smax_value, top_smin_value;
6244	u32 num_bits = size * 8;
6245
6246	if (tnum_is_const(reg->var_off)) {
6247		u32_val = reg->var_off.value;
6248		if (size == 1)
6249			reg->var_off = tnum_const((s8)u32_val);
6250		else
6251			reg->var_off = tnum_const((s16)u32_val);
6252
6253		u32_val = reg->var_off.value;
6254		reg->s32_min_value = reg->s32_max_value = u32_val;
6255		reg->u32_min_value = reg->u32_max_value = u32_val;
6256		return;
6257	}
6258
6259	top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
6260	top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
6261
6262	if (top_smax_value != top_smin_value)
6263		goto out;
6264
6265	/* find the s32_min and s32_max after sign extension */
6266	if (size == 1) {
6267		init_s32_max = (s8)reg->s32_max_value;
6268		init_s32_min = (s8)reg->s32_min_value;
6269	} else {
6270		/* size == 2 */
6271		init_s32_max = (s16)reg->s32_max_value;
6272		init_s32_min = (s16)reg->s32_min_value;
6273	}
6274	s32_max = max(init_s32_max, init_s32_min);
6275	s32_min = min(init_s32_max, init_s32_min);
6276
6277	if ((s32_min >= 0) == (s32_max >= 0)) {
6278		reg->s32_min_value = s32_min;
6279		reg->s32_max_value = s32_max;
6280		reg->u32_min_value = (u32)s32_min;
6281		reg->u32_max_value = (u32)s32_max;
6282		return;
6283	}
6284
6285out:
6286	set_sext32_default_val(reg, size);
6287}
6288
6289static bool bpf_map_is_rdonly(const struct bpf_map *map)
6290{
6291	/* A map is considered read-only if the following conditions are true:
6292	 *
6293	 * 1) BPF program side cannot change any of the map content. The
6294	 *    BPF_F_RDONLY_PROG flag is set at map creation time and stays in
6295	 *    effect throughout the lifetime of the map.
6296	 * 2) The map value(s) have been initialized from user space by a
6297	 *    loader and then "frozen", such that no new map update/delete
6298	 *    operations from syscall side are possible for the rest of
6299	 *    the map's lifetime from that point onwards.
6300	 * 3) Any parallel/pending map update/delete operations from syscall
6301	 *    side have been completed. Only after that point, it's safe to
6302	 *    assume that map value(s) are immutable.
6303	 */
6304	return (map->map_flags & BPF_F_RDONLY_PROG) &&
6305	       READ_ONCE(map->frozen) &&
6306	       !bpf_map_write_active(map);
6307}
6308
6309static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
6310			       bool is_ldsx)
6311{
6312	void *ptr;
6313	u64 addr;
6314	int err;
6315
6316	err = map->ops->map_direct_value_addr(map, &addr, off);
6317	if (err)
6318		return err;
6319	ptr = (void *)(long)addr + off;
6320
6321	switch (size) {
6322	case sizeof(u8):
6323		*val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
6324		break;
6325	case sizeof(u16):
6326		*val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
6327		break;
6328	case sizeof(u32):
6329		*val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
6330		break;
6331	case sizeof(u64):
6332		*val = *(u64 *)ptr;
6333		break;
6334	default:
6335		return -EINVAL;
6336	}
6337	return 0;
6338}
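
/* Illustrative sketch: for a frozen BPF_F_RDONLY_PROG array map with no
 * pending syscall-side writers, bpf_map_is_rdonly() allows the verifier to
 * use bpf_map_direct_read() on a constant-offset load from the map value
 * and track the destination register as a known constant rather than an
 * unknown scalar.
 */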
6339
6340#define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu)
6341#define BTF_TYPE_SAFE_RCU_OR_NULL(__type)  __PASTE(__type, __safe_rcu_or_null)
6342#define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted)
6343#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type)  __PASTE(__type, __safe_trusted_or_null)
6344
6345/*
6346 * Allowlist a few fields as RCU trusted or fully trusted.
6347 * This logic doesn't allow mixed tagging and will be removed once GCC supports
6348 * btf_type_tag.
6349 */
6350
6351/* RCU trusted: these fields are trusted in RCU CS and never NULL */
6352BTF_TYPE_SAFE_RCU(struct task_struct) {
6353	const cpumask_t *cpus_ptr;
6354	struct css_set __rcu *cgroups;
6355	struct task_struct __rcu *real_parent;
6356	struct task_struct *group_leader;
6357};
6358
6359BTF_TYPE_SAFE_RCU(struct cgroup) {
6360	/* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
6361	struct kernfs_node *kn;
6362};
6363
6364BTF_TYPE_SAFE_RCU(struct css_set) {
6365	struct cgroup *dfl_cgrp;
6366};
6367
6368/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
6369BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
6370	struct file __rcu *exe_file;
6371};
6372
6373/* skb->sk, req->sk are not RCU protected, but we mark them as such
6374 * because bpf prog accessible sockets are SOCK_RCU_FREE.
6375 */
6376BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
6377	struct sock *sk;
6378};
6379
6380BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
6381	struct sock *sk;
6382};
6383
6384/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
6385BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
6386	struct seq_file *seq;
6387};
6388
6389BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
6390	struct bpf_iter_meta *meta;
6391	struct task_struct *task;
6392};
6393
6394BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
6395	struct file *file;
6396};
6397
6398BTF_TYPE_SAFE_TRUSTED(struct file) {
6399	struct inode *f_inode;
6400};
6401
6402BTF_TYPE_SAFE_TRUSTED(struct dentry) {
6403	/* no negative dentry-s in places where bpf can see it */
6404	struct inode *d_inode;
6405};
6406
6407BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
6408	struct sock *sk;
6409};
6410
6411static bool type_is_rcu(struct bpf_verifier_env *env,
6412			struct bpf_reg_state *reg,
6413			const char *field_name, u32 btf_id)
6414{
6415	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
6416	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
6417	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
6418
6419	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
6420}
6421
6422static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
6423				struct bpf_reg_state *reg,
6424				const char *field_name, u32 btf_id)
6425{
6426	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
6427	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
6428	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
6429
6430	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
6431}
6432
6433static bool type_is_trusted(struct bpf_verifier_env *env,
6434			    struct bpf_reg_state *reg,
6435			    const char *field_name, u32 btf_id)
6436{
6437	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
6438	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
6439	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
6440	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
6441	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
6442
6443	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
6444}
6445
6446static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
6447				    struct bpf_reg_state *reg,
6448				    const char *field_name, u32 btf_id)
6449{
6450	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
6451
6452	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
6453					  "__safe_trusted_or_null");
6454}
6455
6456static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
6457				   struct bpf_reg_state *regs,
6458				   int regno, int off, int size,
6459				   enum bpf_access_type atype,
6460				   int value_regno)
6461{
6462	struct bpf_reg_state *reg = regs + regno;
6463	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
6464	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
6465	const char *field_name = NULL;
6466	enum bpf_type_flag flag = 0;
6467	u32 btf_id = 0;
6468	int ret;
6469
6470	if (!env->allow_ptr_leaks) {
6471		verbose(env,
6472			"'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
6473			tname);
6474		return -EPERM;
6475	}
6476	if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
6477		verbose(env,
6478			"Cannot access kernel 'struct %s' from non-GPL compatible program\n",
6479			tname);
6480		return -EINVAL;
6481	}
6482	if (off < 0) {
6483		verbose(env,
6484			"R%d is ptr_%s invalid negative access: off=%d\n",
6485			regno, tname, off);
6486		return -EACCES;
6487	}
6488	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
6489		char tn_buf[48];
6490
6491		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
6492		verbose(env,
6493			"R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
6494			regno, tname, off, tn_buf);
6495		return -EACCES;
6496	}
6497
6498	if (reg->type & MEM_USER) {
6499		verbose(env,
6500			"R%d is ptr_%s access user memory: off=%d\n",
6501			regno, tname, off);
6502		return -EACCES;
6503	}
6504
6505	if (reg->type & MEM_PERCPU) {
6506		verbose(env,
6507			"R%d is ptr_%s access percpu memory: off=%d\n",
6508			regno, tname, off);
6509		return -EACCES;
6510	}
6511
6512	if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
6513		if (!btf_is_kernel(reg->btf)) {
6514			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
6515			return -EFAULT;
6516		}
6517		ret = env->ops->btf_struct_access(&env->log, reg, off, size);
6518	} else {
6519		/* Writes are permitted with default btf_struct_access for
6520		 * program allocated objects (which always have ref_obj_id > 0),
6521		 * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
6522		 */
6523		if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
6524			verbose(env, "only read is supported\n");
6525			return -EACCES;
6526		}
6527
6528		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
6529		    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
6530			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
6531			return -EFAULT;
6532		}
6533
6534		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
6535	}
6536
6537	if (ret < 0)
6538		return ret;
6539
6540	if (ret != PTR_TO_BTF_ID) {
6541		/* just mark; */
6542
6543	} else if (type_flag(reg->type) & PTR_UNTRUSTED) {
6544		/* If this is an untrusted pointer, all pointers formed by walking it
6545		 * also inherit the untrusted flag.
6546		 */
6547		flag = PTR_UNTRUSTED;
6548
6549	} else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
6550		/* By default any pointer obtained from walking a trusted pointer is no
6551		 * longer trusted, unless the field being accessed has explicitly been
6552		 * marked as inheriting its parent's state of trust (either full or RCU).
6553		 * For example:
6554		 * 'cgroups' pointer is untrusted if task->cgroups dereference
6555		 * happened in a sleepable program outside of bpf_rcu_read_lock()
6556		 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
6557		 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
6558		 *
6559		 * A regular RCU-protected pointer with __rcu tag can also be deemed
6560		 * trusted if we are in an RCU CS. Such pointer can be NULL.
6561		 */
6562		if (type_is_trusted(env, reg, field_name, btf_id)) {
6563			flag |= PTR_TRUSTED;
6564		} else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
6565			flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
6566		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
6567			if (type_is_rcu(env, reg, field_name, btf_id)) {
6568				/* ignore __rcu tag and mark it MEM_RCU */
6569				flag |= MEM_RCU;
6570			} else if (flag & MEM_RCU ||
6571				   type_is_rcu_or_null(env, reg, field_name, btf_id)) {
6572				/* __rcu tagged pointers can be NULL */
6573				flag |= MEM_RCU | PTR_MAYBE_NULL;
6574
6575				/* We always trust them */
6576				if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
6577				    flag & PTR_UNTRUSTED)
6578					flag &= ~PTR_UNTRUSTED;
6579			} else if (flag & (MEM_PERCPU | MEM_USER)) {
6580				/* keep as-is */
6581			} else {
6582				/* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
6583				clear_trusted_flags(&flag);
6584			}
6585		} else {
6586			/*
			 * If we are not in an RCU CS, or the MEM_RCU pointer can be
			 * NULL, then aggressively mark the pointer as untrusted;
			 * otherwise such pointers would be plain PTR_TO_BTF_ID without
			 * flags and would be allowed to be passed into helpers for
			 * compat reasons.
6592			 */
6593			flag = PTR_UNTRUSTED;
6594		}
6595	} else {
6596		/* Old compat. Deprecated */
6597		clear_trusted_flags(&flag);
6598	}
6599
6600	if (atype == BPF_READ && value_regno >= 0)
6601		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
6602
6603	return 0;
6604}
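
/* Illustrative example (not exercised by the verifier itself) of how the
 * flag propagation above is observed from a BPF program. Assuming a
 * sleepable tracing program walking task_struct:
 *
 *	struct task_struct *task = bpf_get_current_task_btf();   trusted
 *
 *	bpf_rcu_read_lock();
 *	struct css_set *cgroups = task->cgroups;    __rcu field -> MEM_RCU
 *	(use cgroups; valid only inside the RCU CS)
 *	bpf_rcu_read_unlock();                      cgroups becomes PTR_UNTRUSTED
 *
 * The same task->cgroups dereference outside of the RCU CS yields an
 * untrusted pointer and is rejected by helpers/kfuncs expecting trusted args.
 */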
6605
6606static int check_ptr_to_map_access(struct bpf_verifier_env *env,
6607				   struct bpf_reg_state *regs,
6608				   int regno, int off, int size,
6609				   enum bpf_access_type atype,
6610				   int value_regno)
6611{
6612	struct bpf_reg_state *reg = regs + regno;
6613	struct bpf_map *map = reg->map_ptr;
6614	struct bpf_reg_state map_reg;
6615	enum bpf_type_flag flag = 0;
6616	const struct btf_type *t;
6617	const char *tname;
6618	u32 btf_id;
6619	int ret;
6620
6621	if (!btf_vmlinux) {
6622		verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
6623		return -ENOTSUPP;
6624	}
6625
6626	if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
6627		verbose(env, "map_ptr access not supported for map type %d\n",
6628			map->map_type);
6629		return -ENOTSUPP;
6630	}
6631
6632	t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
6633	tname = btf_name_by_offset(btf_vmlinux, t->name_off);
6634
6635	if (!env->allow_ptr_leaks) {
6636		verbose(env,
6637			"'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
6638			tname);
6639		return -EPERM;
6640	}
6641
6642	if (off < 0) {
6643		verbose(env, "R%d is %s invalid negative access: off=%d\n",
6644			regno, tname, off);
6645		return -EACCES;
6646	}
6647
6648	if (atype != BPF_READ) {
6649		verbose(env, "only read from %s is supported\n", tname);
6650		return -EACCES;
6651	}
6652
6653	/* Simulate access to a PTR_TO_BTF_ID */
6654	memset(&map_reg, 0, sizeof(map_reg));
6655	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
6656	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
6657	if (ret < 0)
6658		return ret;
6659
6660	if (value_regno >= 0)
6661		mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
6662
6663	return 0;
6664}
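
/* For example (illustrative only), a tracing program holding a
 * CONST_PTR_TO_MAP can read fixed fields of 'struct bpf_map' directly:
 *
 *	struct bpf_map *map = (struct bpf_map *)&my_map;
 *
 *	if (map->max_entries > 0) { ... }
 *
 * where 'my_map' stands for the program's own map definition. Writes through
 * the map pointer are rejected above, and the read is simulated as a
 * PTR_TO_BTF_ID access using the map type's BTF id.
 */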
6665
6666/* Check that the stack access at the given offset is within bounds. The
6667 * maximum valid offset is -1.
6668 *
6669 * The minimum valid offset is -MAX_BPF_STACK for writes, and
6670 * -state->allocated_stack for reads.
6671 */
6672static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
6673                                          s64 off,
6674                                          struct bpf_func_state *state,
6675                                          enum bpf_access_type t)
6676{
6677	int min_valid_off;
6678
6679	if (t == BPF_WRITE || env->allow_uninit_stack)
6680		min_valid_off = -MAX_BPF_STACK;
6681	else
6682		min_valid_off = -state->allocated_stack;
6683
6684	if (off < min_valid_off || off > -1)
6685		return -EACCES;
6686	return 0;
6687}
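
/* For example, with MAX_BPF_STACK == 512 and state->allocated_stack == 16:
 * a write at off == -24 passes this check (the stack state is grown later),
 * a read at off == -24 is rejected unless env->allow_uninit_stack, and any
 * access at off >= 0 is rejected.
 */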
6688
6689/* Check that the stack access at 'regno + off' falls within the maximum stack
6690 * bounds.
6691 *
 * 'off' includes 'regno->off', but not its dynamic part (if any).
6693 */
6694static int check_stack_access_within_bounds(
6695		struct bpf_verifier_env *env,
6696		int regno, int off, int access_size,
6697		enum bpf_access_src src, enum bpf_access_type type)
6698{
6699	struct bpf_reg_state *regs = cur_regs(env);
6700	struct bpf_reg_state *reg = regs + regno;
6701	struct bpf_func_state *state = func(env, reg);
6702	s64 min_off, max_off;
6703	int err;
6704	char *err_extra;
6705
6706	if (src == ACCESS_HELPER)
6707		/* We don't know if helpers are reading or writing (or both). */
6708		err_extra = " indirect access to";
6709	else if (type == BPF_READ)
6710		err_extra = " read from";
6711	else
6712		err_extra = " write to";
6713
6714	if (tnum_is_const(reg->var_off)) {
6715		min_off = (s64)reg->var_off.value + off;
6716		max_off = min_off + access_size;
6717	} else {
6718		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
6719		    reg->smin_value <= -BPF_MAX_VAR_OFF) {
6720			verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
6721				err_extra, regno);
6722			return -EACCES;
6723		}
6724		min_off = reg->smin_value + off;
6725		max_off = reg->smax_value + off + access_size;
6726	}
6727
6728	err = check_stack_slot_within_bounds(env, min_off, state, type);
6729	if (!err && max_off > 0)
6730		err = -EINVAL; /* out of stack access into non-negative offsets */
6731	if (!err && access_size < 0)
		/* access_size should not be negative (or overflow an int); other checks
6733		 * along the way should have prevented such an access.
6734		 */
6735		err = -EFAULT; /* invalid negative access size; integer overflow? */
6736
6737	if (err) {
6738		if (tnum_is_const(reg->var_off)) {
6739			verbose(env, "invalid%s stack R%d off=%d size=%d\n",
6740				err_extra, regno, off, access_size);
6741		} else {
6742			char tn_buf[48];
6743
6744			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
6745			verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
6746				err_extra, regno, tn_buf, off, access_size);
6747		}
6748		return err;
6749	}
6750
6751	/* Note that there is no stack access with offset zero, so the needed stack
6752	 * size is -min_off, not -min_off+1.
6753	 */
6754	return grow_stack_state(env, state, -min_off /* size */);
6755}
6756
6757/* check whether memory at (regno + off) is accessible for t = (read | write)
6758 * if t==write, value_regno is a register which value is stored into memory
6759 * if t==read, value_regno is a register which will receive the value from memory
6760 * if t==write && value_regno==-1, some unknown value is stored into memory
6761 * if t==read && value_regno==-1, don't care what we read from memory
6762 */
6763static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
6764			    int off, int bpf_size, enum bpf_access_type t,
6765			    int value_regno, bool strict_alignment_once, bool is_ldsx)
6766{
6767	struct bpf_reg_state *regs = cur_regs(env);
6768	struct bpf_reg_state *reg = regs + regno;
6769	int size, err = 0;
6770
6771	size = bpf_size_to_bytes(bpf_size);
6772	if (size < 0)
6773		return size;
6774
6775	/* alignment checks will add in reg->off themselves */
6776	err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
6777	if (err)
6778		return err;
6779
6780	/* for access checks, reg->off is just part of off */
6781	off += reg->off;
6782
6783	if (reg->type == PTR_TO_MAP_KEY) {
6784		if (t == BPF_WRITE) {
6785			verbose(env, "write to change key R%d not allowed\n", regno);
6786			return -EACCES;
6787		}
6788
6789		err = check_mem_region_access(env, regno, off, size,
6790					      reg->map_ptr->key_size, false);
6791		if (err)
6792			return err;
6793		if (value_regno >= 0)
6794			mark_reg_unknown(env, regs, value_regno);
6795	} else if (reg->type == PTR_TO_MAP_VALUE) {
6796		struct btf_field *kptr_field = NULL;
6797
6798		if (t == BPF_WRITE && value_regno >= 0 &&
6799		    is_pointer_value(env, value_regno)) {
6800			verbose(env, "R%d leaks addr into map\n", value_regno);
6801			return -EACCES;
6802		}
6803		err = check_map_access_type(env, regno, off, size, t);
6804		if (err)
6805			return err;
6806		err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
6807		if (err)
6808			return err;
6809		if (tnum_is_const(reg->var_off))
6810			kptr_field = btf_record_find(reg->map_ptr->record,
6811						     off + reg->var_off.value, BPF_KPTR);
6812		if (kptr_field) {
6813			err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
6814		} else if (t == BPF_READ && value_regno >= 0) {
6815			struct bpf_map *map = reg->map_ptr;
6816
6817			/* if map is read-only, track its contents as scalars */
6818			if (tnum_is_const(reg->var_off) &&
6819			    bpf_map_is_rdonly(map) &&
6820			    map->ops->map_direct_value_addr) {
6821				int map_off = off + reg->var_off.value;
6822				u64 val = 0;
6823
6824				err = bpf_map_direct_read(map, map_off, size,
6825							  &val, is_ldsx);
6826				if (err)
6827					return err;
6828
6829				regs[value_regno].type = SCALAR_VALUE;
6830				__mark_reg_known(&regs[value_regno], val);
6831			} else {
6832				mark_reg_unknown(env, regs, value_regno);
6833			}
6834		}
6835	} else if (base_type(reg->type) == PTR_TO_MEM) {
6836		bool rdonly_mem = type_is_rdonly_mem(reg->type);
6837
6838		if (type_may_be_null(reg->type)) {
6839			verbose(env, "R%d invalid mem access '%s'\n", regno,
6840				reg_type_str(env, reg->type));
6841			return -EACCES;
6842		}
6843
6844		if (t == BPF_WRITE && rdonly_mem) {
6845			verbose(env, "R%d cannot write into %s\n",
6846				regno, reg_type_str(env, reg->type));
6847			return -EACCES;
6848		}
6849
6850		if (t == BPF_WRITE && value_regno >= 0 &&
6851		    is_pointer_value(env, value_regno)) {
6852			verbose(env, "R%d leaks addr into mem\n", value_regno);
6853			return -EACCES;
6854		}
6855
6856		err = check_mem_region_access(env, regno, off, size,
6857					      reg->mem_size, false);
6858		if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
6859			mark_reg_unknown(env, regs, value_regno);
6860	} else if (reg->type == PTR_TO_CTX) {
6861		enum bpf_reg_type reg_type = SCALAR_VALUE;
6862		struct btf *btf = NULL;
6863		u32 btf_id = 0;
6864
6865		if (t == BPF_WRITE && value_regno >= 0 &&
6866		    is_pointer_value(env, value_regno)) {
6867			verbose(env, "R%d leaks addr into ctx\n", value_regno);
6868			return -EACCES;
6869		}
6870
6871		err = check_ptr_off_reg(env, reg, regno);
6872		if (err < 0)
6873			return err;
6874
6875		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
6876				       &btf_id);
6877		if (err)
6878			verbose_linfo(env, insn_idx, "; ");
6879		if (!err && t == BPF_READ && value_regno >= 0) {
6880			/* ctx access returns either a scalar, or a
6881			 * PTR_TO_PACKET[_META,_END]. In the latter
6882			 * case, we know the offset is zero.
6883			 */
6884			if (reg_type == SCALAR_VALUE) {
6885				mark_reg_unknown(env, regs, value_regno);
6886			} else {
6887				mark_reg_known_zero(env, regs,
6888						    value_regno);
6889				if (type_may_be_null(reg_type))
6890					regs[value_regno].id = ++env->id_gen;
				/* A load of a ctx field could have a different
				 * actual load size from the one encoded in the
				 * insn. When the dst is PTR, it is for sure not
6894				 * a sub-register.
6895				 */
6896				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
6897				if (base_type(reg_type) == PTR_TO_BTF_ID) {
6898					regs[value_regno].btf = btf;
6899					regs[value_regno].btf_id = btf_id;
6900				}
6901			}
6902			regs[value_regno].type = reg_type;
6903		}
6904
6905	} else if (reg->type == PTR_TO_STACK) {
6906		/* Basic bounds checks. */
6907		err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
6908		if (err)
6909			return err;
6910
6911		if (t == BPF_READ)
6912			err = check_stack_read(env, regno, off, size,
6913					       value_regno);
6914		else
6915			err = check_stack_write(env, regno, off, size,
6916						value_regno, insn_idx);
6917	} else if (reg_is_pkt_pointer(reg)) {
6918		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
6919			verbose(env, "cannot write into packet\n");
6920			return -EACCES;
6921		}
6922		if (t == BPF_WRITE && value_regno >= 0 &&
6923		    is_pointer_value(env, value_regno)) {
6924			verbose(env, "R%d leaks addr into packet\n",
6925				value_regno);
6926			return -EACCES;
6927		}
6928		err = check_packet_access(env, regno, off, size, false);
6929		if (!err && t == BPF_READ && value_regno >= 0)
6930			mark_reg_unknown(env, regs, value_regno);
6931	} else if (reg->type == PTR_TO_FLOW_KEYS) {
6932		if (t == BPF_WRITE && value_regno >= 0 &&
6933		    is_pointer_value(env, value_regno)) {
6934			verbose(env, "R%d leaks addr into flow keys\n",
6935				value_regno);
6936			return -EACCES;
6937		}
6938
6939		err = check_flow_keys_access(env, off, size);
6940		if (!err && t == BPF_READ && value_regno >= 0)
6941			mark_reg_unknown(env, regs, value_regno);
6942	} else if (type_is_sk_pointer(reg->type)) {
6943		if (t == BPF_WRITE) {
6944			verbose(env, "R%d cannot write into %s\n",
6945				regno, reg_type_str(env, reg->type));
6946			return -EACCES;
6947		}
6948		err = check_sock_access(env, insn_idx, regno, off, size, t);
6949		if (!err && value_regno >= 0)
6950			mark_reg_unknown(env, regs, value_regno);
6951	} else if (reg->type == PTR_TO_TP_BUFFER) {
6952		err = check_tp_buffer_access(env, reg, regno, off, size);
6953		if (!err && t == BPF_READ && value_regno >= 0)
6954			mark_reg_unknown(env, regs, value_regno);
6955	} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
6956		   !type_may_be_null(reg->type)) {
6957		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
6958					      value_regno);
6959	} else if (reg->type == CONST_PTR_TO_MAP) {
6960		err = check_ptr_to_map_access(env, regs, regno, off, size, t,
6961					      value_regno);
6962	} else if (base_type(reg->type) == PTR_TO_BUF) {
6963		bool rdonly_mem = type_is_rdonly_mem(reg->type);
6964		u32 *max_access;
6965
6966		if (rdonly_mem) {
6967			if (t == BPF_WRITE) {
6968				verbose(env, "R%d cannot write into %s\n",
6969					regno, reg_type_str(env, reg->type));
6970				return -EACCES;
6971			}
6972			max_access = &env->prog->aux->max_rdonly_access;
6973		} else {
6974			max_access = &env->prog->aux->max_rdwr_access;
6975		}
6976
6977		err = check_buffer_access(env, reg, regno, off, size, false,
6978					  max_access);
6979
6980		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
6981			mark_reg_unknown(env, regs, value_regno);
6982	} else if (reg->type == PTR_TO_ARENA) {
6983		if (t == BPF_READ && value_regno >= 0)
6984			mark_reg_unknown(env, regs, value_regno);
6985	} else {
6986		verbose(env, "R%d invalid mem access '%s'\n", regno,
6987			reg_type_str(env, reg->type));
6988		return -EACCES;
6989	}
6990
6991	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
6992	    regs[value_regno].type == SCALAR_VALUE) {
6993		if (!is_ldsx)
6994			/* b/h/w load zero-extends, mark upper bits as known 0 */
6995			coerce_reg_to_size(&regs[value_regno], size);
6996		else
6997			coerce_reg_to_size_sx(&regs[value_regno], size);
6998	}
6999	return err;
7000}
7001
7002static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
7003			     bool allow_trust_mismatch);
7004
7005static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
7006{
7007	int load_reg;
7008	int err;
7009
7010	switch (insn->imm) {
7011	case BPF_ADD:
7012	case BPF_ADD | BPF_FETCH:
7013	case BPF_AND:
7014	case BPF_AND | BPF_FETCH:
7015	case BPF_OR:
7016	case BPF_OR | BPF_FETCH:
7017	case BPF_XOR:
7018	case BPF_XOR | BPF_FETCH:
7019	case BPF_XCHG:
7020	case BPF_CMPXCHG:
7021		break;
7022	default:
7023		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
7024		return -EINVAL;
7025	}
7026
7027	if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
7028		verbose(env, "invalid atomic operand size\n");
7029		return -EINVAL;
7030	}
7031
7032	/* check src1 operand */
7033	err = check_reg_arg(env, insn->src_reg, SRC_OP);
7034	if (err)
7035		return err;
7036
7037	/* check src2 operand */
7038	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
7039	if (err)
7040		return err;
7041
7042	if (insn->imm == BPF_CMPXCHG) {
7043		/* Check comparison of R0 with memory location */
7044		const u32 aux_reg = BPF_REG_0;
7045
7046		err = check_reg_arg(env, aux_reg, SRC_OP);
7047		if (err)
7048			return err;
7049
7050		if (is_pointer_value(env, aux_reg)) {
7051			verbose(env, "R%d leaks addr into mem\n", aux_reg);
7052			return -EACCES;
7053		}
7054	}
7055
7056	if (is_pointer_value(env, insn->src_reg)) {
7057		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
7058		return -EACCES;
7059	}
7060
7061	if (is_ctx_reg(env, insn->dst_reg) ||
7062	    is_pkt_reg(env, insn->dst_reg) ||
7063	    is_flow_key_reg(env, insn->dst_reg) ||
7064	    is_sk_reg(env, insn->dst_reg) ||
7065	    (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
7066		verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
7067			insn->dst_reg,
7068			reg_type_str(env, reg_state(env, insn->dst_reg)->type));
7069		return -EACCES;
7070	}
7071
7072	if (insn->imm & BPF_FETCH) {
7073		if (insn->imm == BPF_CMPXCHG)
7074			load_reg = BPF_REG_0;
7075		else
7076			load_reg = insn->src_reg;
7077
7078		/* check and record load of old value */
7079		err = check_reg_arg(env, load_reg, DST_OP);
7080		if (err)
7081			return err;
7082	} else {
7083		/* This instruction accesses a memory location but doesn't
7084		 * actually load it into a register.
7085		 */
7086		load_reg = -1;
7087	}
7088
	/* Check whether we can read the memory, with a second call for the
	 * fetch case to simulate the register fill.
7091	 */
7092	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
7093			       BPF_SIZE(insn->code), BPF_READ, -1, true, false);
7094	if (!err && load_reg >= 0)
7095		err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
7096				       BPF_SIZE(insn->code), BPF_READ, load_reg,
7097				       true, false);
7098	if (err)
7099		return err;
7100
7101	if (is_arena_reg(env, insn->dst_reg)) {
7102		err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
7103		if (err)
7104			return err;
7105	}
7106	/* Check whether we can write into the same memory. */
7107	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
7108			       BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
7109	if (err)
7110		return err;
7111	return 0;
7112}
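
/* Illustrative encoding of an atomic fetch-and-add that passes the checks
 * above (assuming the stack slot at offset -8 was previously initialized):
 *
 *	BPF_MOV64_IMM(BPF_REG_1, 1),
 *	BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8),
 *
 * dst_reg (R10) supplies the memory address, src_reg (R1) supplies the
 * operand and, because BPF_FETCH is set, also receives the old value, which
 * is why it is re-checked as a DST_OP above.
 */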
7113
7114/* When register 'regno' is used to read the stack (either directly or through
7115 * a helper function) make sure that it's within stack boundary and, depending
7116 * on the access type and privileges, that all elements of the stack are
7117 * initialized.
7118 *
7119 * 'off' includes 'regno->off', but not its dynamic part (if any).
7120 *
7121 * All registers that have been spilled on the stack in the slots within the
7122 * read offsets are marked as read.
7123 */
7124static int check_stack_range_initialized(
7125		struct bpf_verifier_env *env, int regno, int off,
7126		int access_size, bool zero_size_allowed,
7127		enum bpf_access_src type, struct bpf_call_arg_meta *meta)
7128{
7129	struct bpf_reg_state *reg = reg_state(env, regno);
7130	struct bpf_func_state *state = func(env, reg);
7131	int err, min_off, max_off, i, j, slot, spi;
7132	char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
7133	enum bpf_access_type bounds_check_type;
7134	/* Some accesses can write anything into the stack, others are
7135	 * read-only.
7136	 */
7137	bool clobber = false;
7138
7139	if (access_size == 0 && !zero_size_allowed) {
7140		verbose(env, "invalid zero-sized read\n");
7141		return -EACCES;
7142	}
7143
7144	if (type == ACCESS_HELPER) {
7145		/* The bounds checks for writes are more permissive than for
7146		 * reads. However, if raw_mode is not set, we'll do extra
7147		 * checks below.
7148		 */
7149		bounds_check_type = BPF_WRITE;
7150		clobber = true;
7151	} else {
7152		bounds_check_type = BPF_READ;
7153	}
7154	err = check_stack_access_within_bounds(env, regno, off, access_size,
7155					       type, bounds_check_type);
7156	if (err)
7157		return err;
7158
7160	if (tnum_is_const(reg->var_off)) {
7161		min_off = max_off = reg->var_off.value + off;
7162	} else {
7163		/* Variable offset is prohibited for unprivileged mode for
7164		 * simplicity since it requires corresponding support in
7165		 * Spectre masking for stack ALU.
7166		 * See also retrieve_ptr_limit().
7167		 */
7168		if (!env->bypass_spec_v1) {
7169			char tn_buf[48];
7170
7171			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
7172			verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
7173				regno, err_extra, tn_buf);
7174			return -EACCES;
7175		}
		/* Only an initialized buffer on the stack is allowed to be accessed
		 * with a variable offset. With an uninitialized buffer it's hard to
		 * guarantee that the whole memory is marked as initialized on
		 * helper return, since the specific bounds are unknown, which may
		 * cause uninitialized stack contents to leak.
7181		 */
7182		if (meta && meta->raw_mode)
7183			meta = NULL;
7184
7185		min_off = reg->smin_value + off;
7186		max_off = reg->smax_value + off;
7187	}
7188
7189	if (meta && meta->raw_mode) {
7190		/* Ensure we won't be overwriting dynptrs when simulating byte
7191		 * by byte access in check_helper_call using meta.access_size.
7192		 * This would be a problem if we have a helper in the future
7193		 * which takes:
7194		 *
7195		 *	helper(uninit_mem, len, dynptr)
7196		 *
		 * Now, uninit_mem may overlap with the dynptr pointer. Hence, it
		 * may end up writing to the dynptr itself when touching memory from
		 * arg 1. This can be relaxed on a case by case basis for known
		 * safe cases, but reject due to the possibility of aliasing by
7201		 * default.
7202		 */
7203		for (i = min_off; i < max_off + access_size; i++) {
7204			int stack_off = -i - 1;
7205
7206			spi = __get_spi(i);
7207			/* raw_mode may write past allocated_stack */
7208			if (state->allocated_stack <= stack_off)
7209				continue;
7210			if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
7211				verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
7212				return -EACCES;
7213			}
7214		}
7215		meta->access_size = access_size;
7216		meta->regno = regno;
7217		return 0;
7218	}
7219
7220	for (i = min_off; i < max_off + access_size; i++) {
7221		u8 *stype;
7222
7223		slot = -i - 1;
7224		spi = slot / BPF_REG_SIZE;
7225		if (state->allocated_stack <= slot) {
7226			verbose(env, "verifier bug: allocated_stack too small");
7227			return -EFAULT;
7228		}
7229
7230		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
7231		if (*stype == STACK_MISC)
7232			goto mark;
7233		if ((*stype == STACK_ZERO) ||
7234		    (*stype == STACK_INVALID && env->allow_uninit_stack)) {
7235			if (clobber) {
7236				/* helper can write anything into the stack */
7237				*stype = STACK_MISC;
7238			}
7239			goto mark;
7240		}
7241
7242		if (is_spilled_reg(&state->stack[spi]) &&
7243		    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
7244		     env->allow_ptr_leaks)) {
7245			if (clobber) {
7246				__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
7247				for (j = 0; j < BPF_REG_SIZE; j++)
7248					scrub_spilled_slot(&state->stack[spi].slot_type[j]);
7249			}
7250			goto mark;
7251		}
7252
7253		if (tnum_is_const(reg->var_off)) {
7254			verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
7255				err_extra, regno, min_off, i - min_off, access_size);
7256		} else {
7257			char tn_buf[48];
7258
7259			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
7260			verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
7261				err_extra, regno, tn_buf, i - min_off, access_size);
7262		}
7263		return -EACCES;
7264mark:
7265		/* reading any byte out of 8-byte 'spill_slot' will cause
7266		 * the whole slot to be marked as 'read'
7267		 */
7268		mark_reg_read(env, &state->stack[spi].spilled_ptr,
7269			      state->stack[spi].spilled_ptr.parent,
7270			      REG_LIVE_READ64);
		/* We do not set REG_LIVE_WRITTEN for the stack slot, as we cannot
		 * be sure whether the stack slot is written to or not. Hence,
7273		 * we must still conservatively propagate reads upwards even if
7274		 * helper may write to the entire memory range.
7275		 */
7276	}
7277	return 0;
7278}
7279
7280static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
7281				   int access_size, bool zero_size_allowed,
7282				   struct bpf_call_arg_meta *meta)
7283{
7284	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7285	u32 *max_access;
7286
7287	switch (base_type(reg->type)) {
7288	case PTR_TO_PACKET:
7289	case PTR_TO_PACKET_META:
7290		return check_packet_access(env, regno, reg->off, access_size,
7291					   zero_size_allowed);
7292	case PTR_TO_MAP_KEY:
7293		if (meta && meta->raw_mode) {
7294			verbose(env, "R%d cannot write into %s\n", regno,
7295				reg_type_str(env, reg->type));
7296			return -EACCES;
7297		}
7298		return check_mem_region_access(env, regno, reg->off, access_size,
7299					       reg->map_ptr->key_size, false);
7300	case PTR_TO_MAP_VALUE:
7301		if (check_map_access_type(env, regno, reg->off, access_size,
7302					  meta && meta->raw_mode ? BPF_WRITE :
7303					  BPF_READ))
7304			return -EACCES;
7305		return check_map_access(env, regno, reg->off, access_size,
7306					zero_size_allowed, ACCESS_HELPER);
7307	case PTR_TO_MEM:
7308		if (type_is_rdonly_mem(reg->type)) {
7309			if (meta && meta->raw_mode) {
7310				verbose(env, "R%d cannot write into %s\n", regno,
7311					reg_type_str(env, reg->type));
7312				return -EACCES;
7313			}
7314		}
7315		return check_mem_region_access(env, regno, reg->off,
7316					       access_size, reg->mem_size,
7317					       zero_size_allowed);
7318	case PTR_TO_BUF:
7319		if (type_is_rdonly_mem(reg->type)) {
7320			if (meta && meta->raw_mode) {
7321				verbose(env, "R%d cannot write into %s\n", regno,
7322					reg_type_str(env, reg->type));
7323				return -EACCES;
7324			}
7325
7326			max_access = &env->prog->aux->max_rdonly_access;
7327		} else {
7328			max_access = &env->prog->aux->max_rdwr_access;
7329		}
7330		return check_buffer_access(env, reg, regno, reg->off,
7331					   access_size, zero_size_allowed,
7332					   max_access);
7333	case PTR_TO_STACK:
7334		return check_stack_range_initialized(
7335				env,
7336				regno, reg->off, access_size,
7337				zero_size_allowed, ACCESS_HELPER, meta);
7338	case PTR_TO_BTF_ID:
7339		return check_ptr_to_btf_access(env, regs, regno, reg->off,
7340					       access_size, BPF_READ, -1);
7341	case PTR_TO_CTX:
		/* in case the function doesn't know how to access the context
		 * (because we are in a program of type SYSCALL for example), we
		 * cannot statically check its size.
7345		 * Dynamically check it now.
7346		 */
7347		if (!env->ops->convert_ctx_access) {
7348			enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
7349			int offset = access_size - 1;
7350
7351			/* Allow zero-byte read from PTR_TO_CTX */
7352			if (access_size == 0)
7353				return zero_size_allowed ? 0 : -EACCES;
7354
7355			return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
7356						atype, -1, false, false);
7357		}
7358
7359		fallthrough;
7360	default: /* scalar_value or invalid ptr */
7361		/* Allow zero-byte read from NULL, regardless of pointer type */
7362		if (zero_size_allowed && access_size == 0 &&
7363		    register_is_null(reg))
7364			return 0;
7365
7366		verbose(env, "R%d type=%s ", regno,
7367			reg_type_str(env, reg->type));
7368		verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
7369		return -EACCES;
7370	}
7371}
7372
7373/* verify arguments to helpers or kfuncs consisting of a pointer and an access
7374 * size.
7375 *
7376 * @regno is the register containing the access size. regno-1 is the register
7377 * containing the pointer.
7378 */
7379static int check_mem_size_reg(struct bpf_verifier_env *env,
7380			      struct bpf_reg_state *reg, u32 regno,
7381			      bool zero_size_allowed,
7382			      struct bpf_call_arg_meta *meta)
7383{
7384	int err;
7385
7386	/* This is used to refine r0 return value bounds for helpers
7387	 * that enforce this value as an upper bound on return values.
7388	 * See do_refine_retval_range() for helpers that can refine
	 * the return value. The helper's C type is u32, so we pull the register
	 * bound from umax_value; if the register can be negative, the verifier
	 * errors out. Only upper bounds can be learned because retval is an
7392	 * int type and negative retvals are allowed.
7393	 */
7394	meta->msize_max_value = reg->umax_value;
7395
7396	/* The register is SCALAR_VALUE; the access check
7397	 * happens using its boundaries.
7398	 */
7399	if (!tnum_is_const(reg->var_off))
7400		/* For unprivileged variable accesses, disable raw
7401		 * mode so that the program is required to
7402		 * initialize all the memory that the helper could
7403		 * just partially fill up.
7404		 */
7405		meta = NULL;
7406
7407	if (reg->smin_value < 0) {
7408		verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
7409			regno);
7410		return -EACCES;
7411	}
7412
7413	if (reg->umin_value == 0 && !zero_size_allowed) {
7414		verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
7415			regno, reg->umin_value, reg->umax_value);
7416		return -EACCES;
7417	}
7418
7419	if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
7420		verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
7421			regno);
7422		return -EACCES;
7423	}
7424	err = check_helper_mem_access(env, regno - 1,
7425				      reg->umax_value,
7426				      zero_size_allowed, meta);
7427	if (!err)
7428		err = mark_chain_precision(env, regno);
7429	return err;
7430}
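
/* For example (illustrative), for the helper
 *
 *	long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
 *
 * regno points at the 'size' register and regno - 1 at 'dst'. The memory is
 * checked against the largest value 'size' may hold (umax_value), so programs
 * typically bound it first, e.g. 'size &= 0xff', before making the call.
 */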
7431
7432static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
7433			 u32 regno, u32 mem_size)
7434{
7435	bool may_be_null = type_may_be_null(reg->type);
7436	struct bpf_reg_state saved_reg;
7437	struct bpf_call_arg_meta meta;
7438	int err;
7439
7440	if (register_is_null(reg))
7441		return 0;
7442
7443	memset(&meta, 0, sizeof(meta));
	/* Assuming that the register contains a value, check if the memory
7445	 * access is safe. Temporarily save and restore the register's state as
7446	 * the conversion shouldn't be visible to a caller.
7447	 */
7448	if (may_be_null) {
7449		saved_reg = *reg;
7450		mark_ptr_not_null_reg(reg);
7451	}
7452
7453	err = check_helper_mem_access(env, regno, mem_size, true, &meta);
7454	/* Check access for BPF_WRITE */
7455	meta.raw_mode = true;
7456	err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
7457
7458	if (may_be_null)
7459		*reg = saved_reg;
7460
7461	return err;
7462}
7463
7464static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
7465				    u32 regno)
7466{
7467	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
7468	bool may_be_null = type_may_be_null(mem_reg->type);
7469	struct bpf_reg_state saved_reg;
7470	struct bpf_call_arg_meta meta;
7471	int err;
7472
7473	WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
7474
7475	memset(&meta, 0, sizeof(meta));
7476
7477	if (may_be_null) {
7478		saved_reg = *mem_reg;
7479		mark_ptr_not_null_reg(mem_reg);
7480	}
7481
7482	err = check_mem_size_reg(env, reg, regno, true, &meta);
7483	/* Check access for BPF_WRITE */
7484	meta.raw_mode = true;
7485	err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
7486
7487	if (may_be_null)
7488		*mem_reg = saved_reg;
7489	return err;
7490}
7491
7492/* Implementation details:
7493 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
7494 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
7495 * Two bpf_map_lookups (even with the same key) will have different reg->id.
7496 * Two separate bpf_obj_new will also have different reg->id.
7497 * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
7498 * clears reg->id after value_or_null->value transition, since the verifier only
7499 * cares about the range of access to valid map value pointer and doesn't care
7500 * about actual address of the map element.
7501 * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
7502 * reg->id > 0 after value_or_null->value transition. By doing so
7503 * two bpf_map_lookups will be considered two different pointers that
7504 * point to different bpf_spin_locks. Likewise for pointers to allocated objects
7505 * returned from bpf_obj_new.
7506 * The verifier allows taking only one bpf_spin_lock at a time to avoid
 * deadlocks.
7508 * Since only one bpf_spin_lock is allowed the checks are simpler than
7509 * reg_is_refcounted() logic. The verifier needs to remember only
7510 * one spin_lock instead of array of acquired_refs.
7511 * cur_state->active_lock remembers which map value element or allocated
7512 * object got locked and clears it after bpf_spin_unlock.
7513 */
7514static int process_spin_lock(struct bpf_verifier_env *env, int regno,
7515			     bool is_lock)
7516{
7517	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7518	struct bpf_verifier_state *cur = env->cur_state;
7519	bool is_const = tnum_is_const(reg->var_off);
7520	u64 val = reg->var_off.value;
7521	struct bpf_map *map = NULL;
7522	struct btf *btf = NULL;
7523	struct btf_record *rec;
7524
7525	if (!is_const) {
7526		verbose(env,
7527			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
7528			regno);
7529		return -EINVAL;
7530	}
7531	if (reg->type == PTR_TO_MAP_VALUE) {
7532		map = reg->map_ptr;
7533		if (!map->btf) {
7534			verbose(env,
7535				"map '%s' has to have BTF in order to use bpf_spin_lock\n",
7536				map->name);
7537			return -EINVAL;
7538		}
7539	} else {
7540		btf = reg->btf;
7541	}
7542
7543	rec = reg_btf_record(reg);
7544	if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
7545		verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
7546			map ? map->name : "kptr");
7547		return -EINVAL;
7548	}
7549	if (rec->spin_lock_off != val + reg->off) {
7550		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
7551			val + reg->off, rec->spin_lock_off);
7552		return -EINVAL;
7553	}
7554	if (is_lock) {
7555		if (cur->active_lock.ptr) {
7556			verbose(env,
7557				"Locking two bpf_spin_locks are not allowed\n");
7558			return -EINVAL;
7559		}
7560		if (map)
7561			cur->active_lock.ptr = map;
7562		else
7563			cur->active_lock.ptr = btf;
7564		cur->active_lock.id = reg->id;
7565	} else {
7566		void *ptr;
7567
7568		if (map)
7569			ptr = map;
7570		else
7571			ptr = btf;
7572
7573		if (!cur->active_lock.ptr) {
7574			verbose(env, "bpf_spin_unlock without taking a lock\n");
7575			return -EINVAL;
7576		}
7577		if (cur->active_lock.ptr != ptr ||
7578		    cur->active_lock.id != reg->id) {
7579			verbose(env, "bpf_spin_unlock of different lock\n");
7580			return -EINVAL;
7581		}
7582
7583		invalidate_non_owning_refs(env);
7584
7585		cur->active_lock.ptr = NULL;
7586		cur->active_lock.id = 0;
7587	}
7588	return 0;
7589}
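
/* Typical usage from a BPF program (illustrative sketch; 'my_map' and the
 * value layout are just examples):
 *
 *	struct val {
 *		struct bpf_spin_lock lock;
 *		int counter;
 *	} *v = bpf_map_lookup_elem(&my_map, &key);
 *
 *	if (v) {
 *		bpf_spin_lock(&v->lock);
 *		v->counter++;
 *		bpf_spin_unlock(&v->lock);
 *	}
 *
 * bpf_spin_lock() records (map, reg->id) in cur_state->active_lock and
 * bpf_spin_unlock() must be called with the exact same pair.
 */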
7590
7591static int process_timer_func(struct bpf_verifier_env *env, int regno,
7592			      struct bpf_call_arg_meta *meta)
7593{
7594	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7595	bool is_const = tnum_is_const(reg->var_off);
7596	struct bpf_map *map = reg->map_ptr;
7597	u64 val = reg->var_off.value;
7598
7599	if (!is_const) {
7600		verbose(env,
7601			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
7602			regno);
7603		return -EINVAL;
7604	}
7605	if (!map->btf) {
7606		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
7607			map->name);
7608		return -EINVAL;
7609	}
7610	if (!btf_record_has_field(map->record, BPF_TIMER)) {
7611		verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
7612		return -EINVAL;
7613	}
7614	if (map->record->timer_off != val + reg->off) {
7615		verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
7616			val + reg->off, map->record->timer_off);
7617		return -EINVAL;
7618	}
7619	if (meta->map_ptr) {
7620		verbose(env, "verifier bug. Two map pointers in a timer helper\n");
7621		return -EFAULT;
7622	}
7623	meta->map_uid = reg->map_uid;
7624	meta->map_ptr = map;
7625	return 0;
7626}
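
/* Illustrative map value layout that satisfies the checks above:
 *
 *	struct elem {
 *		struct bpf_timer t;
 *	};
 *
 * The timer helpers must be passed a pointer at exactly
 * map->record->timer_off, e.g.
 *
 *	bpf_timer_init(&val->t, &my_map, CLOCK_MONOTONIC);
 *
 * where 'my_map' and 'val' stand for the program's own map and looked-up
 * value.
 */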
7627
7628static int process_wq_func(struct bpf_verifier_env *env, int regno,
7629			   struct bpf_kfunc_call_arg_meta *meta)
7630{
7631	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7632	struct bpf_map *map = reg->map_ptr;
7633	u64 val = reg->var_off.value;
7634
7635	if (map->record->wq_off != val + reg->off) {
7636		verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
7637			val + reg->off, map->record->wq_off);
7638		return -EINVAL;
7639	}
7640	meta->map.uid = reg->map_uid;
7641	meta->map.ptr = map;
7642	return 0;
7643}
7644
7645static int process_kptr_func(struct bpf_verifier_env *env, int regno,
7646			     struct bpf_call_arg_meta *meta)
7647{
7648	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7649	struct bpf_map *map_ptr = reg->map_ptr;
7650	struct btf_field *kptr_field;
7651	u32 kptr_off;
7652
7653	if (!tnum_is_const(reg->var_off)) {
7654		verbose(env,
7655			"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
7656			regno);
7657		return -EINVAL;
7658	}
7659	if (!map_ptr->btf) {
7660		verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
7661			map_ptr->name);
7662		return -EINVAL;
7663	}
7664	if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
7665		verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
7666		return -EINVAL;
7667	}
7668
7669	meta->map_ptr = map_ptr;
7670	kptr_off = reg->off + reg->var_off.value;
7671	kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
7672	if (!kptr_field) {
7673		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
7674		return -EACCES;
7675	}
7676	if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
7677		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
7678		return -EACCES;
7679	}
7680	meta->kptr_field = kptr_field;
7681	return 0;
7682}
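
/* Illustrative map value with a referenced kptr field that these checks
 * accept (the struct and field names are just examples):
 *
 *	struct map_value {
 *		struct task_struct __kptr *task;
 *	};
 *
 *	old = bpf_kptr_xchg(&v->task, new);
 *
 * The '&v->task' argument must have a constant offset that matches a
 * BPF_KPTR field recorded in the map's btf_record.
 */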
7683
7684/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
7685 * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
7686 *
7687 * In both cases we deal with the first 8 bytes, but need to mark the next 8
7688 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
7689 * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
7690 *
7691 * Mutability of bpf_dynptr is at two levels, one is at the level of struct
7692 * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
7693 * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
7694 * mutate the view of the dynptr and also possibly destroy it. In the latter
7695 * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
7696 * memory that dynptr points to.
7697 *
7698 * The verifier will keep track both levels of mutation (bpf_dynptr's in
7699 * reg->type and the memory's in reg->dynptr.type), but there is no support for
7700 * readonly dynptr view yet, hence only the first case is tracked and checked.
7701 *
7702 * This is consistent with how C applies the const modifier to a struct object,
7703 * where the pointer itself inside bpf_dynptr becomes const but not what it
7704 * points to.
7705 *
7706 * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
7707 * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
7708 */
7709static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
7710			       enum bpf_arg_type arg_type, int clone_ref_obj_id)
7711{
7712	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7713	int err;
7714
7715	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
7716	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
7717	 */
7718	if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
7719		verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
7720		return -EFAULT;
7721	}
7722
7723	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for
7724	 *		 constructing a mutable bpf_dynptr object.
7725	 *
7726	 *		 Currently, this is only possible with PTR_TO_STACK
7727	 *		 pointing to a region of at least 16 bytes which doesn't
7728	 *		 contain an existing bpf_dynptr.
7729	 *
	 *  MEM_RDONLY - Points to an initialized bpf_dynptr that will not be
7731	 *		 mutated or destroyed. However, the memory it points to
7732	 *		 may be mutated.
7733	 *
	 *  None       - Points to an initialized dynptr that can be mutated and
7735	 *		 destroyed, including mutation of the memory it points
7736	 *		 to.
7737	 */
7738	if (arg_type & MEM_UNINIT) {
7739		int i;
7740
7741		if (!is_dynptr_reg_valid_uninit(env, reg)) {
7742			verbose(env, "Dynptr has to be an uninitialized dynptr\n");
7743			return -EINVAL;
7744		}
7745
7746		/* we write BPF_DW bits (8 bytes) at a time */
7747		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
7748			err = check_mem_access(env, insn_idx, regno,
7749					       i, BPF_DW, BPF_WRITE, -1, false, false);
7750			if (err)
7751				return err;
7752		}
7753
7754		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
7755	} else /* MEM_RDONLY and None case from above */ {
7756		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
7757		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
7758			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
7759			return -EINVAL;
7760		}
7761
7762		if (!is_dynptr_reg_valid_init(env, reg)) {
7763			verbose(env,
7764				"Expected an initialized dynptr as arg #%d\n",
7765				regno);
7766			return -EINVAL;
7767		}
7768
7769		/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
7770		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
7771			verbose(env,
7772				"Expected a dynptr of type %s as arg #%d\n",
7773				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
7774			return -EINVAL;
7775		}
7776
7777		err = mark_dynptr_read(env, reg);
7778	}
7779	return err;
7780}
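
/* For example (illustrative), bpf_dynptr_from_mem() declares its dynptr
 * argument with MEM_UNINIT and so takes the first branch above, while
 * bpf_dynptr_read() takes 'const struct bpf_dynptr *' (MEM_RDONLY) and only
 * requires an initialized dynptr:
 *
 *	struct bpf_dynptr ptr;
 *
 *	bpf_dynptr_from_mem(buf, sizeof(buf), 0, &ptr);
 *	bpf_dynptr_read(dst, sizeof(dst), &ptr, 0, 0);
 */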
7781
7782static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
7783{
7784	struct bpf_func_state *state = func(env, reg);
7785
7786	return state->stack[spi].spilled_ptr.ref_obj_id;
7787}
7788
7789static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
7790{
7791	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
7792}
7793
7794static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
7795{
7796	return meta->kfunc_flags & KF_ITER_NEW;
7797}
7798
7799static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
7800{
7801	return meta->kfunc_flags & KF_ITER_NEXT;
7802}
7803
7804static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
7805{
7806	return meta->kfunc_flags & KF_ITER_DESTROY;
7807}
7808
7809static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
7810{
	/* btf_check_iter_kfuncs() guarantees that the first argument of any iter
	 * kfunc is the iter state pointer
7813	 */
7814	return arg == 0 && is_iter_kfunc(meta);
7815}
7816
7817static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
7818			    struct bpf_kfunc_call_arg_meta *meta)
7819{
7820	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
7821	const struct btf_type *t;
7822	const struct btf_param *arg;
7823	int spi, err, i, nr_slots;
7824	u32 btf_id;
7825
7826	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
7827	arg = &btf_params(meta->func_proto)[0];
7828	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */
7829	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
7830	nr_slots = t->size / BPF_REG_SIZE;
7831
7832	if (is_iter_new_kfunc(meta)) {
7833		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
7834		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
7835			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
7836				iter_type_str(meta->btf, btf_id), regno);
7837			return -EINVAL;
7838		}
7839
7840		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
7841			err = check_mem_access(env, insn_idx, regno,
7842					       i, BPF_DW, BPF_WRITE, -1, false, false);
7843			if (err)
7844				return err;
7845		}
7846
7847		err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
7848		if (err)
7849			return err;
7850	} else {
		/* iter_next() or iter_destroy() expect an initialized iter state */
7852		err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
7853		switch (err) {
7854		case 0:
7855			break;
7856		case -EINVAL:
7857			verbose(env, "expected an initialized iter_%s as arg #%d\n",
7858				iter_type_str(meta->btf, btf_id), regno);
7859			return err;
7860		case -EPROTO:
7861			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
7862			return err;
7863		default:
7864			return err;
7865		}
7866
7867		spi = iter_get_spi(env, reg, nr_slots);
7868		if (spi < 0)
7869			return spi;
7870
7871		err = mark_iter_read(env, reg, spi, nr_slots);
7872		if (err)
7873			return err;
7874
7875		/* remember meta->iter info for process_iter_next_call() */
7876		meta->iter.spi = spi;
7877		meta->iter.frameno = reg->frameno;
7878		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
7879
7880		if (is_iter_destroy_kfunc(meta)) {
7881			err = unmark_stack_slots_iter(env, reg, nr_slots);
7882			if (err)
7883				return err;
7884		}
7885	}
7886
7887	return 0;
7888}
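
/* Open-coded iterator usage that exercises the paths above (illustrative):
 *
 *	struct bpf_iter_num it;
 *	int *v, sum = 0;
 *
 *	bpf_iter_num_new(&it, 0, 10);          KF_ITER_NEW: uninit slots
 *	while ((v = bpf_iter_num_next(&it)))   KF_ITER_NEXT: initialized slots
 *		sum += *v;
 *	bpf_iter_num_destroy(&it);             KF_ITER_DESTROY: unmark slots
 */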
7889
7890/* Look for a previous loop entry at insn_idx: nearest parent state
7891 * stopped at insn_idx with callsites matching those in cur->frame.
7892 */
7893static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
7894						  struct bpf_verifier_state *cur,
7895						  int insn_idx)
7896{
7897	struct bpf_verifier_state_list *sl;
7898	struct bpf_verifier_state *st;
7899
7900	/* Explored states are pushed in stack order, most recent states come first */
7901	sl = *explored_state(env, insn_idx);
7902	for (; sl; sl = sl->next) {
		/* If st->branches != 0, the state is a part of the current DFS
		 * verification path, hence cur & st form a loop.
7905		 */
7906		st = &sl->state;
7907		if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
7908		    st->dfs_depth < cur->dfs_depth)
7909			return st;
7910	}
7911
7912	return NULL;
7913}
7914
7915static void reset_idmap_scratch(struct bpf_verifier_env *env);
7916static bool regs_exact(const struct bpf_reg_state *rold,
7917		       const struct bpf_reg_state *rcur,
7918		       struct bpf_idmap *idmap);
7919
7920static void maybe_widen_reg(struct bpf_verifier_env *env,
7921			    struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
7922			    struct bpf_idmap *idmap)
7923{
7924	if (rold->type != SCALAR_VALUE)
7925		return;
7926	if (rold->type != rcur->type)
7927		return;
7928	if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
7929		return;
7930	__mark_reg_unknown(env, rcur);
7931}
7932
7933static int widen_imprecise_scalars(struct bpf_verifier_env *env,
7934				   struct bpf_verifier_state *old,
7935				   struct bpf_verifier_state *cur)
7936{
7937	struct bpf_func_state *fold, *fcur;
7938	int i, fr;
7939
7940	reset_idmap_scratch(env);
7941	for (fr = old->curframe; fr >= 0; fr--) {
7942		fold = old->frame[fr];
7943		fcur = cur->frame[fr];
7944
7945		for (i = 0; i < MAX_BPF_REG; i++)
7946			maybe_widen_reg(env,
7947					&fold->regs[i],
7948					&fcur->regs[i],
7949					&env->idmap_scratch);
7950
7951		for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
7952			if (!is_spilled_reg(&fold->stack[i]) ||
7953			    !is_spilled_reg(&fcur->stack[i]))
7954				continue;
7955
7956			maybe_widen_reg(env,
7957					&fold->stack[i].spilled_ptr,
7958					&fcur->stack[i].spilled_ptr,
7959					&env->idmap_scratch);
7960		}
7961	}
7962	return 0;
7963}
7964
7965/* process_iter_next_call() is called when verifier gets to iterator's next
7966 * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
7967 * to it as just "iter_next()" in comments below.
7968 *
7969 * BPF verifier relies on a crucial contract for any iter_next()
7970 * implementation: it should *eventually* return NULL, and once that happens
7971 * it should keep returning NULL. That is, once iterator exhausts elements to
7972 * iterate, it should never reset or spuriously return new elements.
7973 *
7974 * With the assumption of such contract, process_iter_next_call() simulates
7975 * a fork in the verifier state to validate loop logic correctness and safety
 * without having to simulate an infinite number of iterations.
7977 *
 * In the current state, we first assume that iter_next() returned NULL and
 * the iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
7980 * conditions we should not form an infinite loop and should eventually reach
7981 * exit.
7982 *
7983 * Besides that, we also fork current state and enqueue it for later
7984 * verification. In a forked state we keep iterator state as ACTIVE
7985 * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
7986 * also bump iteration depth to prevent erroneous infinite loop detection
7987 * later on (see iter_active_depths_differ() comment for details). In this
7988 * state we assume that we'll eventually loop back to another iter_next()
 * call (it could be in exactly the same location or in some other instruction,
7990 * it doesn't matter, we don't make any unnecessary assumptions about this,
7991 * everything revolves around iterator state in a stack slot, not which
7992 * instruction is calling iter_next()). When that happens, we either will come
7993 * to iter_next() with equivalent state and can conclude that next iteration
7994 * will proceed in exactly the same way as we just verified, so it's safe to
7995 * assume that loop converges. If not, we'll go on another iteration
7996 * simulation with a different input state, until all possible starting states
7997 * are validated or we reach maximum number of instructions limit.
7998 *
7999 * This way, we will either exhaustively discover all possible input states
8000 * that iterator loop can start with and eventually will converge, or we'll
8001 * effectively regress into bounded loop simulation logic and either reach
8002 * maximum number of instructions if loop is not provably convergent, or there
8003 * is some statically known limit on number of iterations (e.g., if there is
8004 * an explicit `if n > 100 then break;` statement somewhere in the loop).
8005 *
8006 * Iteration convergence logic in is_state_visited() relies on exact
8007 * states comparison, which ignores read and precision marks.
8008 * This is necessary because read and precision marks are not finalized
8009 * while in the loop. Exact comparison might preclude convergence for
8010 * simple programs like below:
8011 *
8012 *     i = 0;
8013 *     while(iter_next(&it))
8014 *       i++;
8015 *
8016 * At each iteration step i++ would produce a new distinct state and
8017 * eventually instruction processing limit would be reached.
8018 *
8019 * To avoid such behavior speculatively forget (widen) range for
8020 * imprecise scalar registers, if those registers were not precise at the
8021 * end of the previous iteration and do not match exactly.
8022 *
 * This is a conservative heuristic that allows verification of a wide range of
 * programs, however it precludes verification of programs that conjure an
 * imprecise value on the first loop iteration and use it as precise on the second.
8026 * For example, the following safe program would fail to verify:
8027 *
8028 *     struct bpf_num_iter it;
8029 *     int arr[10];
8030 *     int i = 0, a = 0;
8031 *     bpf_iter_num_new(&it, 0, 10);
8032 *     while (bpf_iter_num_next(&it)) {
8033 *       if (a == 0) {
8034 *         a = 1;
 *         i = 7; // Because i changed, the verifier would forget
 *                // its range on the second loop entry.
8037 *       } else {
8038 *         arr[i] = 42; // This would fail to verify.
8039 *       }
8040 *     }
8041 *     bpf_iter_num_destroy(&it);
8042 */
8043static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
8044				  struct bpf_kfunc_call_arg_meta *meta)
8045{
8046	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
8047	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
8048	struct bpf_reg_state *cur_iter, *queued_iter;
8049	int iter_frameno = meta->iter.frameno;
8050	int iter_spi = meta->iter.spi;
8051
8052	BTF_TYPE_EMIT(struct bpf_iter);
8053
8054	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
8055
8056	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
8057	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
8058		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
8059			cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
8060		return -EFAULT;
8061	}
8062
8063	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
		/* Because an iter_next() call is a checkpoint, is_state_visited()
		 * should guarantee a parent state with the same call sites and insn_idx.
8066		 */
8067		if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
8068		    !same_callsites(cur_st->parent, cur_st)) {
8069			verbose(env, "bug: bad parent state for iter next call");
8070			return -EFAULT;
8071		}
8072		/* Note cur_st->parent in the call below, it is necessary to skip
8073		 * checkpoint created for cur_st by is_state_visited()
8074		 * right at this instruction.
8075		 */
8076		prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
8077		/* branch out active iter state */
8078		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
8079		if (!queued_st)
8080			return -ENOMEM;
8081
8082		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
8083		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
8084		queued_iter->iter.depth++;
8085		if (prev_st)
8086			widen_imprecise_scalars(env, prev_st, queued_st);
8087
8088		queued_fr = queued_st->frame[queued_st->curframe];
8089		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
8090	}
8091
	/* mark current iter state as DRAINED and assume iter_next() returned
	 * NULL, but keep the depth unchanged
	 */
8094	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
8095	__mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
8096
8097	return 0;
8098}
8099
8100static bool arg_type_is_mem_size(enum bpf_arg_type type)
8101{
8102	return type == ARG_CONST_SIZE ||
8103	       type == ARG_CONST_SIZE_OR_ZERO;
8104}
8105
8106static bool arg_type_is_release(enum bpf_arg_type type)
8107{
8108	return type & OBJ_RELEASE;
8109}
8110
8111static bool arg_type_is_dynptr(enum bpf_arg_type type)
8112{
8113	return base_type(type) == ARG_PTR_TO_DYNPTR;
8114}
8115
8116static int int_ptr_type_to_size(enum bpf_arg_type type)
8117{
8118	if (type == ARG_PTR_TO_INT)
8119		return sizeof(u32);
8120	else if (type == ARG_PTR_TO_LONG)
8121		return sizeof(u64);
8122
8123	return -EINVAL;
8124}
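
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build): from the program's point of view, ARG_PTR_TO_LONG describes
 * arguments like the 'res' pointer of bpf_strtol(), which must point to at
 * least sizeof(u64) bytes of accessible memory, typically a stack variable:
 *
 *     long res;
 *
 *     if (bpf_strtol(buf, sizeof(buf), 0, &res) < 0)
 *             return 0;
 */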
8125
8126static int resolve_map_arg_type(struct bpf_verifier_env *env,
8127				 const struct bpf_call_arg_meta *meta,
8128				 enum bpf_arg_type *arg_type)
8129{
8130	if (!meta->map_ptr) {
8131		/* kernel subsystem misconfigured verifier */
8132		verbose(env, "invalid map_ptr to access map->type\n");
8133		return -EACCES;
8134	}
8135
8136	switch (meta->map_ptr->map_type) {
8137	case BPF_MAP_TYPE_SOCKMAP:
8138	case BPF_MAP_TYPE_SOCKHASH:
8139		if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
8140			*arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
8141		} else {
8142			verbose(env, "invalid arg_type for sockmap/sockhash\n");
8143			return -EINVAL;
8144		}
8145		break;
8146	case BPF_MAP_TYPE_BLOOM_FILTER:
8147		if (meta->func_id == BPF_FUNC_map_peek_elem)
8148			*arg_type = ARG_PTR_TO_MAP_VALUE;
8149		break;
8150	default:
8151		break;
8152	}
8153	return 0;
8154}
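
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build): for sockmap/sockhash the "value" handed to the generic map
 * helpers is a socket pointer rather than plain map memory, which is why the
 * arg type is rewritten to ARG_PTR_TO_BTF_ID_SOCK_COMMON above:
 *
 *     struct bpf_sock *sk = ...;   // e.g. obtained from the program context
 *
 *     bpf_map_update_elem(&sock_map, &key, sk, BPF_ANY);
 */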
8155
8156struct bpf_reg_types {
8157	const enum bpf_reg_type types[10];
8158	u32 *btf_id;
8159};
8160
8161static const struct bpf_reg_types sock_types = {
8162	.types = {
8163		PTR_TO_SOCK_COMMON,
8164		PTR_TO_SOCKET,
8165		PTR_TO_TCP_SOCK,
8166		PTR_TO_XDP_SOCK,
8167	},
8168};
8169
8170#ifdef CONFIG_NET
8171static const struct bpf_reg_types btf_id_sock_common_types = {
8172	.types = {
8173		PTR_TO_SOCK_COMMON,
8174		PTR_TO_SOCKET,
8175		PTR_TO_TCP_SOCK,
8176		PTR_TO_XDP_SOCK,
8177		PTR_TO_BTF_ID,
8178		PTR_TO_BTF_ID | PTR_TRUSTED,
8179	},
8180	.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
8181};
8182#endif
8183
8184static const struct bpf_reg_types mem_types = {
8185	.types = {
8186		PTR_TO_STACK,
8187		PTR_TO_PACKET,
8188		PTR_TO_PACKET_META,
8189		PTR_TO_MAP_KEY,
8190		PTR_TO_MAP_VALUE,
8191		PTR_TO_MEM,
8192		PTR_TO_MEM | MEM_RINGBUF,
8193		PTR_TO_BUF,
8194		PTR_TO_BTF_ID | PTR_TRUSTED,
8195	},
8196};
8197
8198static const struct bpf_reg_types int_ptr_types = {
8199	.types = {
8200		PTR_TO_STACK,
8201		PTR_TO_PACKET,
8202		PTR_TO_PACKET_META,
8203		PTR_TO_MAP_KEY,
8204		PTR_TO_MAP_VALUE,
8205	},
8206};
8207
8208static const struct bpf_reg_types spin_lock_types = {
8209	.types = {
8210		PTR_TO_MAP_VALUE,
8211		PTR_TO_BTF_ID | MEM_ALLOC,
8212	}
8213};
8214
8215static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
8216static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
8217static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
8218static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
8219static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
8220static const struct bpf_reg_types btf_ptr_types = {
8221	.types = {
8222		PTR_TO_BTF_ID,
8223		PTR_TO_BTF_ID | PTR_TRUSTED,
8224		PTR_TO_BTF_ID | MEM_RCU,
8225	},
8226};
8227static const struct bpf_reg_types percpu_btf_ptr_types = {
8228	.types = {
8229		PTR_TO_BTF_ID | MEM_PERCPU,
8230		PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
8231		PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
8232	}
8233};
8234static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
8235static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
8236static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
8237static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
8238static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
8239static const struct bpf_reg_types dynptr_types = {
8240	.types = {
8241		PTR_TO_STACK,
8242		CONST_PTR_TO_DYNPTR,
8243	}
8244};
8245
8246static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
8247	[ARG_PTR_TO_MAP_KEY]		= &mem_types,
8248	[ARG_PTR_TO_MAP_VALUE]		= &mem_types,
8249	[ARG_CONST_SIZE]		= &scalar_types,
8250	[ARG_CONST_SIZE_OR_ZERO]	= &scalar_types,
8251	[ARG_CONST_ALLOC_SIZE_OR_ZERO]	= &scalar_types,
8252	[ARG_CONST_MAP_PTR]		= &const_map_ptr_types,
8253	[ARG_PTR_TO_CTX]		= &context_types,
8254	[ARG_PTR_TO_SOCK_COMMON]	= &sock_types,
8255#ifdef CONFIG_NET
8256	[ARG_PTR_TO_BTF_ID_SOCK_COMMON]	= &btf_id_sock_common_types,
8257#endif
8258	[ARG_PTR_TO_SOCKET]		= &fullsock_types,
8259	[ARG_PTR_TO_BTF_ID]		= &btf_ptr_types,
8260	[ARG_PTR_TO_SPIN_LOCK]		= &spin_lock_types,
8261	[ARG_PTR_TO_MEM]		= &mem_types,
8262	[ARG_PTR_TO_RINGBUF_MEM]	= &ringbuf_mem_types,
8263	[ARG_PTR_TO_INT]		= &int_ptr_types,
8264	[ARG_PTR_TO_LONG]		= &int_ptr_types,
8265	[ARG_PTR_TO_PERCPU_BTF_ID]	= &percpu_btf_ptr_types,
8266	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
8267	[ARG_PTR_TO_STACK]		= &stack_ptr_types,
8268	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
8269	[ARG_PTR_TO_TIMER]		= &timer_types,
8270	[ARG_PTR_TO_KPTR]		= &kptr_types,
8271	[ARG_PTR_TO_DYNPTR]		= &dynptr_types,
8272};
8273
8274static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
8275			  enum bpf_arg_type arg_type,
8276			  const u32 *arg_btf_id,
8277			  struct bpf_call_arg_meta *meta)
8278{
8279	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
8280	enum bpf_reg_type expected, type = reg->type;
8281	const struct bpf_reg_types *compatible;
8282	int i, j;
8283
8284	compatible = compatible_reg_types[base_type(arg_type)];
8285	if (!compatible) {
8286		verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
8287		return -EFAULT;
8288	}
8289
8290	/* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
8291	 * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
8292	 *
8293	 * Same for MAYBE_NULL:
8294	 *
8295	 * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
8296	 * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
8297	 *
8298	 * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
8299	 *
8300	 * Therefore we fold these flags depending on the arg_type before comparison.
8301	 */
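	/* For example (illustrative only): an argument declared as
	 * ARG_PTR_TO_MEM | MEM_RDONLY accepts a register of type
	 * PTR_TO_MEM | MEM_RDONLY because MEM_RDONLY is cleared from 'type'
	 * below, while with a plain ARG_PTR_TO_MEM the register keeps
	 * MEM_RDONLY and fails the comparison.
	 */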
8302	if (arg_type & MEM_RDONLY)
8303		type &= ~MEM_RDONLY;
8304	if (arg_type & PTR_MAYBE_NULL)
8305		type &= ~PTR_MAYBE_NULL;
8306	if (base_type(arg_type) == ARG_PTR_TO_MEM)
8307		type &= ~DYNPTR_TYPE_FLAG_MASK;
8308
8309	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
8310		type &= ~MEM_ALLOC;
8311		type &= ~MEM_PERCPU;
8312	}
8313
8314	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
8315		expected = compatible->types[i];
8316		if (expected == NOT_INIT)
8317			break;
8318
8319		if (type == expected)
8320			goto found;
8321	}
8322
8323	verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
8324	for (j = 0; j + 1 < i; j++)
8325		verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
8326	verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
8327	return -EACCES;
8328
8329found:
8330	if (base_type(reg->type) != PTR_TO_BTF_ID)
8331		return 0;
8332
8333	if (compatible == &mem_types) {
8334		if (!(arg_type & MEM_RDONLY)) {
8335			verbose(env,
8336				"%s() may write into memory pointed by R%d type=%s\n",
8337				func_id_name(meta->func_id),
8338				regno, reg_type_str(env, reg->type));
8339			return -EACCES;
8340		}
8341		return 0;
8342	}
8343
8344	switch ((int)reg->type) {
8345	case PTR_TO_BTF_ID:
8346	case PTR_TO_BTF_ID | PTR_TRUSTED:
8347	case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
8348	case PTR_TO_BTF_ID | MEM_RCU:
8349	case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
8350	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
8351	{
8352		/* For bpf_sk_release, it needs to match against first member
8353		 * 'struct sock_common', hence make an exception for it. This
8354		 * allows bpf_sk_release to work for multiple socket types.
8355		 */
8356		bool strict_type_match = arg_type_is_release(arg_type) &&
8357					 meta->func_id != BPF_FUNC_sk_release;
8358
8359		if (type_may_be_null(reg->type) &&
8360		    (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
8361			verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
8362			return -EACCES;
8363		}
8364
8365		if (!arg_btf_id) {
8366			if (!compatible->btf_id) {
8367				verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
8368				return -EFAULT;
8369			}
8370			arg_btf_id = compatible->btf_id;
8371		}
8372
8373		if (meta->func_id == BPF_FUNC_kptr_xchg) {
8374			if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
8375				return -EACCES;
8376		} else {
8377			if (arg_btf_id == BPF_PTR_POISON) {
8378				verbose(env, "verifier internal error:");
8379				verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
8380					regno);
8381				return -EACCES;
8382			}
8383
8384			if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
8385						  btf_vmlinux, *arg_btf_id,
8386						  strict_type_match)) {
8387				verbose(env, "R%d is of type %s but %s is expected\n",
8388					regno, btf_type_name(reg->btf, reg->btf_id),
8389					btf_type_name(btf_vmlinux, *arg_btf_id));
8390				return -EACCES;
8391			}
8392		}
8393		break;
8394	}
8395	case PTR_TO_BTF_ID | MEM_ALLOC:
8396	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
8397		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
8398		    meta->func_id != BPF_FUNC_kptr_xchg) {
8399			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
8400			return -EFAULT;
8401		}
8402		if (meta->func_id == BPF_FUNC_kptr_xchg) {
8403			if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
8404				return -EACCES;
8405		}
8406		break;
8407	case PTR_TO_BTF_ID | MEM_PERCPU:
8408	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
8409	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
8410		/* Handled by helper specific checks */
8411		break;
8412	default:
8413		verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
8414		return -EFAULT;
8415	}
8416	return 0;
8417}
8418
8419static struct btf_field *
8420reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
8421{
8422	struct btf_field *field;
8423	struct btf_record *rec;
8424
8425	rec = reg_btf_record(reg);
8426	if (!rec)
8427		return NULL;
8428
8429	field = btf_record_find(rec, off, fields);
8430	if (!field)
8431		return NULL;
8432
8433	return field;
8434}
8435
8436static int check_func_arg_reg_off(struct bpf_verifier_env *env,
8437				  const struct bpf_reg_state *reg, int regno,
8438				  enum bpf_arg_type arg_type)
8439{
8440	u32 type = reg->type;
8441
	/* When a referenced register is passed to a release function, its fixed
	 * offset must be 0.
	 *
	 * We check that a register passed as an arg_type_is_release argument
	 * has a ref_obj_id when storing meta->release_regno.
	 */
8448	if (arg_type_is_release(arg_type)) {
		/* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
		 * may not directly point to the object being released, but to
		 * a dynptr pointing to such an object, which might be at some
		 * offset on the stack. In that case, we simply fall back to the
		 * default handling.
		 */
8455		if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
8456			return 0;
8457
		/* The check_ptr_off_reg() check below would catch this because
		 * fixed_off_ok is false, but checking here allows us to give
		 * the user a better error message.
		 */
8462		if (reg->off) {
8463			verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
8464				regno);
8465			return -EINVAL;
8466		}
8467		return __check_ptr_off_reg(env, reg, regno, false);
8468	}
8469
8470	switch (type) {
8471	/* Pointer types where both fixed and variable offset is explicitly allowed: */
8472	case PTR_TO_STACK:
8473	case PTR_TO_PACKET:
8474	case PTR_TO_PACKET_META:
8475	case PTR_TO_MAP_KEY:
8476	case PTR_TO_MAP_VALUE:
8477	case PTR_TO_MEM:
8478	case PTR_TO_MEM | MEM_RDONLY:
8479	case PTR_TO_MEM | MEM_RINGBUF:
8480	case PTR_TO_BUF:
8481	case PTR_TO_BUF | MEM_RDONLY:
8482	case PTR_TO_ARENA:
8483	case SCALAR_VALUE:
8484		return 0;
8485	/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
8486	 * fixed offset.
8487	 */
8488	case PTR_TO_BTF_ID:
8489	case PTR_TO_BTF_ID | MEM_ALLOC:
8490	case PTR_TO_BTF_ID | PTR_TRUSTED:
8491	case PTR_TO_BTF_ID | MEM_RCU:
8492	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
8493	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
		/* When a referenced PTR_TO_BTF_ID is passed to a release function,
		 * its fixed offset must be 0. In the other cases, a fixed offset
		 * can be non-zero. This was already checked above, so pass
		 * fixed_off_ok as true to allow a fixed offset for all other
		 * cases. var_off must always be 0 for PTR_TO_BTF_ID, hence we
		 * still need to do checks instead of returning.
		 */
8501		return __check_ptr_off_reg(env, reg, regno, true);
8502	default:
8503		return __check_ptr_off_reg(env, reg, regno, false);
8504	}
8505}
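
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of the rule enforced above: a referenced socket pointer must
 * still point at the start of the object (fixed offset 0) when it is handed
 * to a release helper:
 *
 *     struct bpf_sock *sk;
 *
 *     sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
 *     if (sk)
 *             bpf_sk_release(sk);   // offset 0, accepted
 */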
8506
8507static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
8508						const struct bpf_func_proto *fn,
8509						struct bpf_reg_state *regs)
8510{
8511	struct bpf_reg_state *state = NULL;
8512	int i;
8513
8514	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
8515		if (arg_type_is_dynptr(fn->arg_type[i])) {
8516			if (state) {
8517				verbose(env, "verifier internal error: multiple dynptr args\n");
8518				return NULL;
8519			}
8520			state = &regs[BPF_REG_1 + i];
8521		}
8522
8523	if (!state)
8524		verbose(env, "verifier internal error: no dynptr arg found\n");
8525
8526	return state;
8527}
8528
8529static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
8530{
8531	struct bpf_func_state *state = func(env, reg);
8532	int spi;
8533
8534	if (reg->type == CONST_PTR_TO_DYNPTR)
8535		return reg->id;
8536	spi = dynptr_get_spi(env, reg);
8537	if (spi < 0)
8538		return spi;
8539	return state->stack[spi].spilled_ptr.id;
8540}
8541
8542static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
8543{
8544	struct bpf_func_state *state = func(env, reg);
8545	int spi;
8546
8547	if (reg->type == CONST_PTR_TO_DYNPTR)
8548		return reg->ref_obj_id;
8549	spi = dynptr_get_spi(env, reg);
8550	if (spi < 0)
8551		return spi;
8552	return state->stack[spi].spilled_ptr.ref_obj_id;
8553}
8554
8555static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
8556					    struct bpf_reg_state *reg)
8557{
8558	struct bpf_func_state *state = func(env, reg);
8559	int spi;
8560
8561	if (reg->type == CONST_PTR_TO_DYNPTR)
8562		return reg->dynptr.type;
8563
8564	spi = __get_spi(reg->off);
8565	if (spi < 0) {
8566		verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
8567		return BPF_DYNPTR_TYPE_INVALID;
8568	}
8569
8570	return state->stack[spi].spilled_ptr.dynptr.type;
8571}
8572
8573static int check_reg_const_str(struct bpf_verifier_env *env,
8574			       struct bpf_reg_state *reg, u32 regno)
8575{
8576	struct bpf_map *map = reg->map_ptr;
8577	int err;
8578	int map_off;
8579	u64 map_addr;
8580	char *str_ptr;
8581
8582	if (reg->type != PTR_TO_MAP_VALUE)
8583		return -EINVAL;
8584
8585	if (!bpf_map_is_rdonly(map)) {
		verbose(env, "R%d does not point to a readonly map\n", regno);
8587		return -EACCES;
8588	}
8589
8590	if (!tnum_is_const(reg->var_off)) {
		verbose(env, "R%d is not a constant address\n", regno);
8592		return -EACCES;
8593	}
8594
8595	if (!map->ops->map_direct_value_addr) {
8596		verbose(env, "no direct value access support for this map type\n");
8597		return -EACCES;
8598	}
8599
8600	err = check_map_access(env, regno, reg->off,
8601			       map->value_size - reg->off, false,
8602			       ACCESS_HELPER);
8603	if (err)
8604		return err;
8605
8606	map_off = reg->off + reg->var_off.value;
8607	err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
8608	if (err) {
8609		verbose(env, "direct value access on string failed\n");
8610		return err;
8611	}
8612
8613	str_ptr = (char *)(long)(map_addr);
8614	if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
8615		verbose(env, "string is not zero-terminated\n");
8616		return -EINVAL;
8617	}
8618	return 0;
8619}
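
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build): ARG_PTR_TO_CONST_STR is satisfied by a NUL-terminated string
 * living in a read-only map, which is what a string constant placed in
 * .rodata becomes:
 *
 *     static const char fmt[] = "pid=%d\n";
 *     __u64 args[] = { pid };
 *     char out[32];
 *
 *     bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));
 *
 * The checks above make sure the map is read-only, the address is a known
 * constant and the string is zero-terminated within the map value.
 */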
8620
8621static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
8622			  struct bpf_call_arg_meta *meta,
8623			  const struct bpf_func_proto *fn,
8624			  int insn_idx)
8625{
8626	u32 regno = BPF_REG_1 + arg;
8627	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
8628	enum bpf_arg_type arg_type = fn->arg_type[arg];
8629	enum bpf_reg_type type = reg->type;
8630	u32 *arg_btf_id = NULL;
8631	int err = 0;
8632
8633	if (arg_type == ARG_DONTCARE)
8634		return 0;
8635
8636	err = check_reg_arg(env, regno, SRC_OP);
8637	if (err)
8638		return err;
8639
8640	if (arg_type == ARG_ANYTHING) {
8641		if (is_pointer_value(env, regno)) {
8642			verbose(env, "R%d leaks addr into helper function\n",
8643				regno);
8644			return -EACCES;
8645		}
8646		return 0;
8647	}
8648
8649	if (type_is_pkt_pointer(type) &&
8650	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
8651		verbose(env, "helper access to the packet is not allowed\n");
8652		return -EACCES;
8653	}
8654
8655	if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
8656		err = resolve_map_arg_type(env, meta, &arg_type);
8657		if (err)
8658			return err;
8659	}
8660
8661	if (register_is_null(reg) && type_may_be_null(arg_type))
8662		/* A NULL register has a SCALAR_VALUE type, so skip
8663		 * type checking.
8664		 */
8665		goto skip_type_check;
8666
8667	/* arg_btf_id and arg_size are in a union. */
8668	if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
8669	    base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
8670		arg_btf_id = fn->arg_btf_id[arg];
8671
8672	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
8673	if (err)
8674		return err;
8675
8676	err = check_func_arg_reg_off(env, reg, regno, arg_type);
8677	if (err)
8678		return err;
8679
8680skip_type_check:
8681	if (arg_type_is_release(arg_type)) {
8682		if (arg_type_is_dynptr(arg_type)) {
8683			struct bpf_func_state *state = func(env, reg);
8684			int spi;
8685
8686			/* Only dynptr created on stack can be released, thus
8687			 * the get_spi and stack state checks for spilled_ptr
8688			 * should only be done before process_dynptr_func for
8689			 * PTR_TO_STACK.
8690			 */
8691			if (reg->type == PTR_TO_STACK) {
8692				spi = dynptr_get_spi(env, reg);
8693				if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
8694					verbose(env, "arg %d is an unacquired reference\n", regno);
8695					return -EINVAL;
8696				}
8697			} else {
8698				verbose(env, "cannot release unowned const bpf_dynptr\n");
8699				return -EINVAL;
8700			}
8701		} else if (!reg->ref_obj_id && !register_is_null(reg)) {
8702			verbose(env, "R%d must be referenced when passed to release function\n",
8703				regno);
8704			return -EINVAL;
8705		}
8706		if (meta->release_regno) {
8707			verbose(env, "verifier internal error: more than one release argument\n");
8708			return -EFAULT;
8709		}
8710		meta->release_regno = regno;
8711	}
8712
8713	if (reg->ref_obj_id) {
8714		if (meta->ref_obj_id) {
8715			verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
8716				regno, reg->ref_obj_id,
8717				meta->ref_obj_id);
8718			return -EFAULT;
8719		}
8720		meta->ref_obj_id = reg->ref_obj_id;
8721	}
8722
8723	switch (base_type(arg_type)) {
8724	case ARG_CONST_MAP_PTR:
8725		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
8726		if (meta->map_ptr) {
8727			/* Use map_uid (which is unique id of inner map) to reject:
8728			 * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
8729			 * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
8730			 * if (inner_map1 && inner_map2) {
8731			 *     timer = bpf_map_lookup_elem(inner_map1);
8732			 *     if (timer)
8733			 *         // mismatch would have been allowed
8734			 *         bpf_timer_init(timer, inner_map2);
8735			 * }
8736			 *
8737			 * Comparing map_ptr is enough to distinguish normal and outer maps.
8738			 */
8739			if (meta->map_ptr != reg->map_ptr ||
8740			    meta->map_uid != reg->map_uid) {
8741				verbose(env,
8742					"timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
8743					meta->map_uid, reg->map_uid);
8744				return -EINVAL;
8745			}
8746		}
8747		meta->map_ptr = reg->map_ptr;
8748		meta->map_uid = reg->map_uid;
8749		break;
8750	case ARG_PTR_TO_MAP_KEY:
8751		/* bpf_map_xxx(..., map_ptr, ..., key) call:
8752		 * check that [key, key + map->key_size) are within
8753		 * stack limits and initialized
8754		 */
8755		if (!meta->map_ptr) {
			/* In the function declaration, map_ptr must come before
			 * map_key, so that it's verified and known before we
			 * check map_key here. Otherwise it means that a kernel
			 * subsystem misconfigured the verifier.
			 */
8761			verbose(env, "invalid map_ptr to access map->key\n");
8762			return -EACCES;
8763		}
8764		err = check_helper_mem_access(env, regno,
8765					      meta->map_ptr->key_size, false,
8766					      NULL);
8767		break;
8768	case ARG_PTR_TO_MAP_VALUE:
8769		if (type_may_be_null(arg_type) && register_is_null(reg))
8770			return 0;
8771
8772		/* bpf_map_xxx(..., map_ptr, ..., value) call:
8773		 * check [value, value + map->value_size) validity
8774		 */
8775		if (!meta->map_ptr) {
8776			/* kernel subsystem misconfigured verifier */
8777			verbose(env, "invalid map_ptr to access map->value\n");
8778			return -EACCES;
8779		}
8780		meta->raw_mode = arg_type & MEM_UNINIT;
8781		err = check_helper_mem_access(env, regno,
8782					      meta->map_ptr->value_size, false,
8783					      meta);
8784		break;
8785	case ARG_PTR_TO_PERCPU_BTF_ID:
8786		if (!reg->btf_id) {
8787			verbose(env, "Helper has invalid btf_id in R%d\n", regno);
8788			return -EACCES;
8789		}
8790		meta->ret_btf = reg->btf;
8791		meta->ret_btf_id = reg->btf_id;
8792		break;
8793	case ARG_PTR_TO_SPIN_LOCK:
8794		if (in_rbtree_lock_required_cb(env)) {
8795			verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
8796			return -EACCES;
8797		}
8798		if (meta->func_id == BPF_FUNC_spin_lock) {
8799			err = process_spin_lock(env, regno, true);
8800			if (err)
8801				return err;
8802		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
8803			err = process_spin_lock(env, regno, false);
8804			if (err)
8805				return err;
8806		} else {
8807			verbose(env, "verifier internal error\n");
8808			return -EFAULT;
8809		}
8810		break;
8811	case ARG_PTR_TO_TIMER:
8812		err = process_timer_func(env, regno, meta);
8813		if (err)
8814			return err;
8815		break;
8816	case ARG_PTR_TO_FUNC:
8817		meta->subprogno = reg->subprogno;
8818		break;
8819	case ARG_PTR_TO_MEM:
8820		/* The access to this pointer is only checked when we hit the
8821		 * next is_mem_size argument below.
8822		 */
8823		meta->raw_mode = arg_type & MEM_UNINIT;
8824		if (arg_type & MEM_FIXED_SIZE) {
8825			err = check_helper_mem_access(env, regno,
8826						      fn->arg_size[arg], false,
8827						      meta);
8828		}
8829		break;
8830	case ARG_CONST_SIZE:
8831		err = check_mem_size_reg(env, reg, regno, false, meta);
8832		break;
8833	case ARG_CONST_SIZE_OR_ZERO:
8834		err = check_mem_size_reg(env, reg, regno, true, meta);
8835		break;
8836	case ARG_PTR_TO_DYNPTR:
8837		err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
8838		if (err)
8839			return err;
8840		break;
8841	case ARG_CONST_ALLOC_SIZE_OR_ZERO:
8842		if (!tnum_is_const(reg->var_off)) {
			verbose(env, "R%d is not a known constant\n",
8844				regno);
8845			return -EACCES;
8846		}
8847		meta->mem_size = reg->var_off.value;
8848		err = mark_chain_precision(env, regno);
8849		if (err)
8850			return err;
8851		break;
8852	case ARG_PTR_TO_INT:
8853	case ARG_PTR_TO_LONG:
8854	{
8855		int size = int_ptr_type_to_size(arg_type);
8856
8857		err = check_helper_mem_access(env, regno, size, false, meta);
8858		if (err)
8859			return err;
8860		err = check_ptr_alignment(env, reg, 0, size, true);
8861		break;
8862	}
8863	case ARG_PTR_TO_CONST_STR:
8864	{
8865		err = check_reg_const_str(env, reg, regno);
8866		if (err)
8867			return err;
8868		break;
8869	}
8870	case ARG_PTR_TO_KPTR:
8871		err = process_kptr_func(env, regno, meta);
8872		if (err)
8873			return err;
8874		break;
8875	}
8876
8877	return err;
8878}
8879
8880static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
8881{
8882	enum bpf_attach_type eatype = env->prog->expected_attach_type;
8883	enum bpf_prog_type type = resolve_prog_type(env->prog);
8884
8885	if (func_id != BPF_FUNC_map_update_elem &&
8886	    func_id != BPF_FUNC_map_delete_elem)
8887		return false;
8888
8889	/* It's not possible to get access to a locked struct sock in these
8890	 * contexts, so updating is safe.
8891	 */
8892	switch (type) {
8893	case BPF_PROG_TYPE_TRACING:
8894		if (eatype == BPF_TRACE_ITER)
8895			return true;
8896		break;
8897	case BPF_PROG_TYPE_SOCK_OPS:
8898		/* map_update allowed only via dedicated helpers with event type checks */
8899		if (func_id == BPF_FUNC_map_delete_elem)
8900			return true;
8901		break;
8902	case BPF_PROG_TYPE_SOCKET_FILTER:
8903	case BPF_PROG_TYPE_SCHED_CLS:
8904	case BPF_PROG_TYPE_SCHED_ACT:
8905	case BPF_PROG_TYPE_XDP:
8906	case BPF_PROG_TYPE_SK_REUSEPORT:
8907	case BPF_PROG_TYPE_FLOW_DISSECTOR:
8908	case BPF_PROG_TYPE_SK_LOOKUP:
8909		return true;
8910	default:
8911		break;
8912	}
8913
8914	verbose(env, "cannot update sockmap in this context\n");
8915	return false;
8916}
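
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build): sockmap/sockhash element deletion from BPF is only allowed
 * in the contexts accepted above, e.g. from an iterator or sock_ops program:
 *
 *     __u32 key = 0;
 *
 *     bpf_map_delete_elem(&sock_map, &key);
 *
 * whereas bpf_map_update_elem() on a sockmap is limited to the program types
 * listed in the switch above; elsewhere the verifier reports
 * "cannot update sockmap in this context".
 */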
8917
8918static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
8919{
8920	return env->prog->jit_requested &&
8921	       bpf_jit_supports_subprog_tailcalls();
8922}
8923
8924static int check_map_func_compatibility(struct bpf_verifier_env *env,
8925					struct bpf_map *map, int func_id)
8926{
8927	if (!map)
8928		return 0;
8929
8930	/* We need a two way check, first is from map perspective ... */
8931	switch (map->map_type) {
8932	case BPF_MAP_TYPE_PROG_ARRAY:
8933		if (func_id != BPF_FUNC_tail_call)
8934			goto error;
8935		break;
8936	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
8937		if (func_id != BPF_FUNC_perf_event_read &&
8938		    func_id != BPF_FUNC_perf_event_output &&
8939		    func_id != BPF_FUNC_skb_output &&
8940		    func_id != BPF_FUNC_perf_event_read_value &&
8941		    func_id != BPF_FUNC_xdp_output)
8942			goto error;
8943		break;
8944	case BPF_MAP_TYPE_RINGBUF:
8945		if (func_id != BPF_FUNC_ringbuf_output &&
8946		    func_id != BPF_FUNC_ringbuf_reserve &&
8947		    func_id != BPF_FUNC_ringbuf_query &&
8948		    func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
8949		    func_id != BPF_FUNC_ringbuf_submit_dynptr &&
8950		    func_id != BPF_FUNC_ringbuf_discard_dynptr)
8951			goto error;
8952		break;
8953	case BPF_MAP_TYPE_USER_RINGBUF:
8954		if (func_id != BPF_FUNC_user_ringbuf_drain)
8955			goto error;
8956		break;
8957	case BPF_MAP_TYPE_STACK_TRACE:
8958		if (func_id != BPF_FUNC_get_stackid)
8959			goto error;
8960		break;
8961	case BPF_MAP_TYPE_CGROUP_ARRAY:
8962		if (func_id != BPF_FUNC_skb_under_cgroup &&
8963		    func_id != BPF_FUNC_current_task_under_cgroup)
8964			goto error;
8965		break;
8966	case BPF_MAP_TYPE_CGROUP_STORAGE:
8967	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
8968		if (func_id != BPF_FUNC_get_local_storage)
8969			goto error;
8970		break;
8971	case BPF_MAP_TYPE_DEVMAP:
8972	case BPF_MAP_TYPE_DEVMAP_HASH:
8973		if (func_id != BPF_FUNC_redirect_map &&
8974		    func_id != BPF_FUNC_map_lookup_elem)
8975			goto error;
8976		break;
8977	/* Restrict bpf side of cpumap and xskmap, open when use-cases
8978	 * appear.
8979	 */
8980	case BPF_MAP_TYPE_CPUMAP:
8981		if (func_id != BPF_FUNC_redirect_map)
8982			goto error;
8983		break;
8984	case BPF_MAP_TYPE_XSKMAP:
8985		if (func_id != BPF_FUNC_redirect_map &&
8986		    func_id != BPF_FUNC_map_lookup_elem)
8987			goto error;
8988		break;
8989	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
8990	case BPF_MAP_TYPE_HASH_OF_MAPS:
8991		if (func_id != BPF_FUNC_map_lookup_elem)
8992			goto error;
8993		break;
8994	case BPF_MAP_TYPE_SOCKMAP:
8995		if (func_id != BPF_FUNC_sk_redirect_map &&
8996		    func_id != BPF_FUNC_sock_map_update &&
8997		    func_id != BPF_FUNC_msg_redirect_map &&
8998		    func_id != BPF_FUNC_sk_select_reuseport &&
8999		    func_id != BPF_FUNC_map_lookup_elem &&
9000		    !may_update_sockmap(env, func_id))
9001			goto error;
9002		break;
9003	case BPF_MAP_TYPE_SOCKHASH:
9004		if (func_id != BPF_FUNC_sk_redirect_hash &&
9005		    func_id != BPF_FUNC_sock_hash_update &&
9006		    func_id != BPF_FUNC_msg_redirect_hash &&
9007		    func_id != BPF_FUNC_sk_select_reuseport &&
9008		    func_id != BPF_FUNC_map_lookup_elem &&
9009		    !may_update_sockmap(env, func_id))
9010			goto error;
9011		break;
9012	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
9013		if (func_id != BPF_FUNC_sk_select_reuseport)
9014			goto error;
9015		break;
9016	case BPF_MAP_TYPE_QUEUE:
9017	case BPF_MAP_TYPE_STACK:
9018		if (func_id != BPF_FUNC_map_peek_elem &&
9019		    func_id != BPF_FUNC_map_pop_elem &&
9020		    func_id != BPF_FUNC_map_push_elem)
9021			goto error;
9022		break;
9023	case BPF_MAP_TYPE_SK_STORAGE:
9024		if (func_id != BPF_FUNC_sk_storage_get &&
9025		    func_id != BPF_FUNC_sk_storage_delete &&
9026		    func_id != BPF_FUNC_kptr_xchg)
9027			goto error;
9028		break;
9029	case BPF_MAP_TYPE_INODE_STORAGE:
9030		if (func_id != BPF_FUNC_inode_storage_get &&
9031		    func_id != BPF_FUNC_inode_storage_delete &&
9032		    func_id != BPF_FUNC_kptr_xchg)
9033			goto error;
9034		break;
9035	case BPF_MAP_TYPE_TASK_STORAGE:
9036		if (func_id != BPF_FUNC_task_storage_get &&
9037		    func_id != BPF_FUNC_task_storage_delete &&
9038		    func_id != BPF_FUNC_kptr_xchg)
9039			goto error;
9040		break;
9041	case BPF_MAP_TYPE_CGRP_STORAGE:
9042		if (func_id != BPF_FUNC_cgrp_storage_get &&
9043		    func_id != BPF_FUNC_cgrp_storage_delete &&
9044		    func_id != BPF_FUNC_kptr_xchg)
9045			goto error;
9046		break;
9047	case BPF_MAP_TYPE_BLOOM_FILTER:
9048		if (func_id != BPF_FUNC_map_peek_elem &&
9049		    func_id != BPF_FUNC_map_push_elem)
9050			goto error;
9051		break;
9052	default:
9053		break;
9054	}
9055
9056	/* ... and second from the function itself. */
9057	switch (func_id) {
9058	case BPF_FUNC_tail_call:
9059		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
9060			goto error;
9061		if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
9062			verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
9063			return -EINVAL;
9064		}
9065		break;
9066	case BPF_FUNC_perf_event_read:
9067	case BPF_FUNC_perf_event_output:
9068	case BPF_FUNC_perf_event_read_value:
9069	case BPF_FUNC_skb_output:
9070	case BPF_FUNC_xdp_output:
9071		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
9072			goto error;
9073		break;
9074	case BPF_FUNC_ringbuf_output:
9075	case BPF_FUNC_ringbuf_reserve:
9076	case BPF_FUNC_ringbuf_query:
9077	case BPF_FUNC_ringbuf_reserve_dynptr:
9078	case BPF_FUNC_ringbuf_submit_dynptr:
9079	case BPF_FUNC_ringbuf_discard_dynptr:
9080		if (map->map_type != BPF_MAP_TYPE_RINGBUF)
9081			goto error;
9082		break;
9083	case BPF_FUNC_user_ringbuf_drain:
9084		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
9085			goto error;
9086		break;
9087	case BPF_FUNC_get_stackid:
9088		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
9089			goto error;
9090		break;
9091	case BPF_FUNC_current_task_under_cgroup:
9092	case BPF_FUNC_skb_under_cgroup:
9093		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
9094			goto error;
9095		break;
9096	case BPF_FUNC_redirect_map:
9097		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
9098		    map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
9099		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
9100		    map->map_type != BPF_MAP_TYPE_XSKMAP)
9101			goto error;
9102		break;
9103	case BPF_FUNC_sk_redirect_map:
9104	case BPF_FUNC_msg_redirect_map:
9105	case BPF_FUNC_sock_map_update:
9106		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
9107			goto error;
9108		break;
9109	case BPF_FUNC_sk_redirect_hash:
9110	case BPF_FUNC_msg_redirect_hash:
9111	case BPF_FUNC_sock_hash_update:
9112		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
9113			goto error;
9114		break;
9115	case BPF_FUNC_get_local_storage:
9116		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
9117		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
9118			goto error;
9119		break;
9120	case BPF_FUNC_sk_select_reuseport:
9121		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
9122		    map->map_type != BPF_MAP_TYPE_SOCKMAP &&
9123		    map->map_type != BPF_MAP_TYPE_SOCKHASH)
9124			goto error;
9125		break;
9126	case BPF_FUNC_map_pop_elem:
9127		if (map->map_type != BPF_MAP_TYPE_QUEUE &&
9128		    map->map_type != BPF_MAP_TYPE_STACK)
9129			goto error;
9130		break;
9131	case BPF_FUNC_map_peek_elem:
9132	case BPF_FUNC_map_push_elem:
9133		if (map->map_type != BPF_MAP_TYPE_QUEUE &&
9134		    map->map_type != BPF_MAP_TYPE_STACK &&
9135		    map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
9136			goto error;
9137		break;
9138	case BPF_FUNC_map_lookup_percpu_elem:
9139		if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
9140		    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
9141		    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
9142			goto error;
9143		break;
9144	case BPF_FUNC_sk_storage_get:
9145	case BPF_FUNC_sk_storage_delete:
9146		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
9147			goto error;
9148		break;
9149	case BPF_FUNC_inode_storage_get:
9150	case BPF_FUNC_inode_storage_delete:
9151		if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
9152			goto error;
9153		break;
9154	case BPF_FUNC_task_storage_get:
9155	case BPF_FUNC_task_storage_delete:
9156		if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
9157			goto error;
9158		break;
9159	case BPF_FUNC_cgrp_storage_get:
9160	case BPF_FUNC_cgrp_storage_delete:
9161		if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
9162			goto error;
9163		break;
9164	default:
9165		break;
9166	}
9167
9168	return 0;
9169error:
9170	verbose(env, "cannot pass map_type %d into func %s#%d\n",
9171		map->map_type, func_id_name(func_id), func_id);
9172	return -EINVAL;
9173}
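
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of a mismatch rejected by the two-way check above:
 *
 *     struct {
 *             __uint(type, BPF_MAP_TYPE_ARRAY);
 *             __uint(max_entries, 4);
 *             __type(key, __u32);
 *             __type(value, __u32);
 *     } plain_array SEC(".maps");
 *
 *     bpf_tail_call(ctx, &plain_array, 0);
 *
 * is rejected with the "cannot pass map_type %d into func %s#%d" error above,
 * since bpf_tail_call() only accepts BPF_MAP_TYPE_PROG_ARRAY.
 */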
9174
9175static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
9176{
9177	int count = 0;
9178
9179	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
9180		count++;
9181	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
9182		count++;
9183	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
9184		count++;
9185	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
9186		count++;
9187	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
9188		count++;
9189
9190	/* We only support one arg being in raw mode at the moment,
9191	 * which is sufficient for the helper functions we have
9192	 * right now.
9193	 */
9194	return count <= 1;
9195}
9196
9197static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
9198{
9199	bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
9200	bool has_size = fn->arg_size[arg] != 0;
9201	bool is_next_size = false;
9202
9203	if (arg + 1 < ARRAY_SIZE(fn->arg_type))
9204		is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
9205
9206	if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
9207		return is_next_size;
9208
9209	return has_size == is_next_size || is_next_size == is_fixed;
9210}
9211
9212static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
9213{
9214	/* bpf_xxx(..., buf, len) call will access 'len'
9215	 * bytes from memory 'buf'. Both arg types need
9216	 * to be paired, so make sure there's no buggy
9217	 * helper function specification.
9218	 */
9219	if (arg_type_is_mem_size(fn->arg1_type) ||
9220	    check_args_pair_invalid(fn, 0) ||
9221	    check_args_pair_invalid(fn, 1) ||
9222	    check_args_pair_invalid(fn, 2) ||
9223	    check_args_pair_invalid(fn, 3) ||
9224	    check_args_pair_invalid(fn, 4))
9225		return false;
9226
9227	return true;
9228}
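
/* For reference (a sketch of how an existing helper proto satisfies the
 * pairing rule; shown as an example only): bpf_probe_read_kernel() declares
 *
 *     .arg1_type = ARG_PTR_TO_UNINIT_MEM,
 *     .arg2_type = ARG_CONST_SIZE_OR_ZERO,
 *     .arg3_type = ARG_ANYTHING,
 *
 * i.e. the memory argument is immediately followed by its size argument,
 * which is exactly what check_arg_pair_ok() demands.
 */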
9229
9230static bool check_btf_id_ok(const struct bpf_func_proto *fn)
9231{
9232	int i;
9233
9234	for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
9235		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
9236			return !!fn->arg_btf_id[i];
9237		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
9238			return fn->arg_btf_id[i] == BPF_PTR_POISON;
9239		if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
9240		    /* arg_btf_id and arg_size are in a union. */
9241		    (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
9242		     !(fn->arg_type[i] & MEM_FIXED_SIZE)))
9243			return false;
9244	}
9245
9246	return true;
9247}
9248
9249static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
9250{
9251	return check_raw_mode_ok(fn) &&
9252	       check_arg_pair_ok(fn) &&
9253	       check_btf_id_ok(fn) ? 0 : -EINVAL;
9254}
9255
9256/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
9257 * are now invalid, so turn them into unknown SCALAR_VALUE.
9258 *
9259 * This also applies to dynptr slices belonging to skb and xdp dynptrs,
9260 * since these slices point to packet data.
9261 */
9262static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
9263{
9264	struct bpf_func_state *state;
9265	struct bpf_reg_state *reg;
9266
9267	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
9268		if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
9269			mark_reg_invalid(env, reg);
9270	}));
9271}
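
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build): once a helper that may move packet data has been called,
 * previously derived packet pointers are invalid and bounds must be
 * re-established before the next access:
 *
 *     void *data = (void *)(long)skb->data;
 *     void *data_end = (void *)(long)skb->data_end;
 *     ...
 *     bpf_skb_pull_data(skb, 64);
 *     data = (void *)(long)skb->data;          // reload after the helper
 *     data_end = (void *)(long)skb->data_end;
 *     if (data + 64 > data_end)
 *             return TC_ACT_SHOT;              // re-check bounds
 */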
9272
9273enum {
9274	AT_PKT_END = -1,
9275	BEYOND_PKT_END = -2,
9276};
9277
9278static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
9279{
9280	struct bpf_func_state *state = vstate->frame[vstate->curframe];
9281	struct bpf_reg_state *reg = &state->regs[regn];
9282
9283	if (reg->type != PTR_TO_PACKET)
9284		/* PTR_TO_PACKET_META is not supported yet */
9285		return;
9286
	/* The 'reg' is pkt > pkt_end or pkt >= pkt_end.
	 * How far beyond pkt_end it goes is unknown.
	 * if (!range_open) it's the case of pkt >= pkt_end,
	 * if (range_open) it's the case of pkt > pkt_end,
	 * hence this pointer is at least 1 byte beyond pkt_end.
	 */
9293	if (range_open)
9294		reg->range = BEYOND_PKT_END;
9295	else
9296		reg->range = AT_PKT_END;
9297}
9298
9299/* The pointer with the specified id has released its reference to kernel
9300 * resources. Identify all copies of the same pointer and clear the reference.
9301 */
9302static int release_reference(struct bpf_verifier_env *env,
9303			     int ref_obj_id)
9304{
9305	struct bpf_func_state *state;
9306	struct bpf_reg_state *reg;
9307	int err;
9308
9309	err = release_reference_state(cur_func(env), ref_obj_id);
9310	if (err)
9311		return err;
9312
9313	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
9314		if (reg->ref_obj_id == ref_obj_id)
9315			mark_reg_invalid(env, reg);
9316	}));
9317
9318	return 0;
9319}
9320
9321static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
9322{
9323	struct bpf_func_state *unused;
9324	struct bpf_reg_state *reg;
9325
9326	bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
9327		if (type_is_non_owning_ref(reg->type))
9328			mark_reg_invalid(env, reg);
9329	}));
9330}
9331
9332static void clear_caller_saved_regs(struct bpf_verifier_env *env,
9333				    struct bpf_reg_state *regs)
9334{
9335	int i;
9336
9337	/* after the call registers r0 - r5 were scratched */
9338	for (i = 0; i < CALLER_SAVED_REGS; i++) {
9339		mark_reg_not_init(env, regs, caller_saved[i]);
9340		__check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
9341	}
9342}
9343
9344typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
9345				   struct bpf_func_state *caller,
9346				   struct bpf_func_state *callee,
9347				   int insn_idx);
9348
9349static int set_callee_state(struct bpf_verifier_env *env,
9350			    struct bpf_func_state *caller,
9351			    struct bpf_func_state *callee, int insn_idx);
9352
9353static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
9354			    set_callee_state_fn set_callee_state_cb,
9355			    struct bpf_verifier_state *state)
9356{
9357	struct bpf_func_state *caller, *callee;
9358	int err;
9359
9360	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
9361		verbose(env, "the call stack of %d frames is too deep\n",
9362			state->curframe + 2);
9363		return -E2BIG;
9364	}
9365
9366	if (state->frame[state->curframe + 1]) {
9367		verbose(env, "verifier bug. Frame %d already allocated\n",
9368			state->curframe + 1);
9369		return -EFAULT;
9370	}
9371
9372	caller = state->frame[state->curframe];
9373	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
9374	if (!callee)
9375		return -ENOMEM;
9376	state->frame[state->curframe + 1] = callee;
9377
9378	/* callee cannot access r0, r6 - r9 for reading and has to write
9379	 * into its own stack before reading from it.
9380	 * callee can read/write into caller's stack
9381	 */
9382	init_func_state(env, callee,
9383			/* remember the callsite, it will be used by bpf_exit */
9384			callsite,
9385			state->curframe + 1 /* frameno within this callchain */,
9386			subprog /* subprog number within this prog */);
9387	/* Transfer references to the callee */
9388	err = copy_reference_state(callee, caller);
9389	err = err ?: set_callee_state_cb(env, caller, callee, callsite);
9390	if (err)
9391		goto err_out;
9392
9393	/* only increment it after check_reg_arg() finished */
9394	state->curframe++;
9395
9396	return 0;
9397
9398err_out:
9399	free_func_state(callee);
9400	state->frame[state->curframe + 1] = NULL;
9401	return err;
9402}
9403
9404static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
9405				    const struct btf *btf,
9406				    struct bpf_reg_state *regs)
9407{
9408	struct bpf_subprog_info *sub = subprog_info(env, subprog);
9409	struct bpf_verifier_log *log = &env->log;
9410	u32 i;
9411	int ret;
9412
9413	ret = btf_prepare_func_args(env, subprog);
9414	if (ret)
9415		return ret;
9416
9417	/* check that BTF function arguments match actual types that the
9418	 * verifier sees.
9419	 */
9420	for (i = 0; i < sub->arg_cnt; i++) {
9421		u32 regno = i + 1;
9422		struct bpf_reg_state *reg = &regs[regno];
9423		struct bpf_subprog_arg_info *arg = &sub->args[i];
9424
9425		if (arg->arg_type == ARG_ANYTHING) {
9426			if (reg->type != SCALAR_VALUE) {
9427				bpf_log(log, "R%d is not a scalar\n", regno);
9428				return -EINVAL;
9429			}
9430		} else if (arg->arg_type == ARG_PTR_TO_CTX) {
9431			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
9432			if (ret < 0)
9433				return ret;
9434			/* If function expects ctx type in BTF check that caller
9435			 * is passing PTR_TO_CTX.
9436			 */
9437			if (reg->type != PTR_TO_CTX) {
9438				bpf_log(log, "arg#%d expects pointer to ctx\n", i);
9439				return -EINVAL;
9440			}
9441		} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
9442			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
9443			if (ret < 0)
9444				return ret;
9445			if (check_mem_reg(env, reg, regno, arg->mem_size))
9446				return -EINVAL;
9447			if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
9448				bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
9449				return -EINVAL;
9450			}
9451		} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
			/*
			 * Any value can be passed and the kernel won't crash, but
			 * only PTR_TO_ARENA or SCALAR makes sense. Everything
			 * else is a bug in the bpf program. Point it out to the
			 * user at verification time instead of leaving a
			 * run-time debugging nightmare.
			 */
9459			if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
9460				bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
9461				return -EINVAL;
9462			}
9463		} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
9464			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
9465			if (ret)
9466				return ret;
9467		} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
9468			struct bpf_call_arg_meta meta;
9469			int err;
9470
9471			if (register_is_null(reg) && type_may_be_null(arg->arg_type))
9472				continue;
9473
9474			memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
9475			err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
9476			err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
9477			if (err)
9478				return err;
9479		} else {
9480			bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
9481				i, arg->arg_type);
9482			return -EFAULT;
9483		}
9484	}
9485
9486	return 0;
9487}
9488
9489/* Compare BTF of a function call with given bpf_reg_state.
9490 * Returns:
9491 * EFAULT - there is a verifier bug. Abort verification.
9492 * EINVAL - there is a type mismatch or BTF is not available.
9493 * 0 - BTF matches with what bpf_reg_state expects.
9494 * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
9495 */
9496static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
9497				  struct bpf_reg_state *regs)
9498{
9499	struct bpf_prog *prog = env->prog;
9500	struct btf *btf = prog->aux->btf;
9501	u32 btf_id;
9502	int err;
9503
9504	if (!prog->aux->func_info)
9505		return -EINVAL;
9506
9507	btf_id = prog->aux->func_info[subprog].type_id;
9508	if (!btf_id)
9509		return -EFAULT;
9510
9511	if (prog->aux->func_info_aux[subprog].unreliable)
9512		return -EINVAL;
9513
9514	err = btf_check_func_arg_match(env, subprog, btf, regs);
9515	/* Compiler optimizations can remove arguments from static functions
9516	 * or mismatched type can be passed into a global function.
9517	 * In such cases mark the function as unreliable from BTF point of view.
9518	 */
9519	if (err)
9520		prog->aux->func_info_aux[subprog].unreliable = true;
9521	return err;
9522}
9523
9524static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
9525			      int insn_idx, int subprog,
9526			      set_callee_state_fn set_callee_state_cb)
9527{
9528	struct bpf_verifier_state *state = env->cur_state, *callback_state;
9529	struct bpf_func_state *caller, *callee;
9530	int err;
9531
9532	caller = state->frame[state->curframe];
9533	err = btf_check_subprog_call(env, subprog, caller->regs);
9534	if (err == -EFAULT)
9535		return err;
9536
9537	/* set_callee_state is used for direct subprog calls, but we are
9538	 * interested in validating only BPF helpers that can call subprogs as
9539	 * callbacks
9540	 */
9541	env->subprog_info[subprog].is_cb = true;
9542	if (bpf_pseudo_kfunc_call(insn) &&
9543	    !is_callback_calling_kfunc(insn->imm)) {
9544		verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
9545			func_id_name(insn->imm), insn->imm);
9546		return -EFAULT;
9547	} else if (!bpf_pseudo_kfunc_call(insn) &&
9548		   !is_callback_calling_function(insn->imm)) { /* helper */
9549		verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
9550			func_id_name(insn->imm), insn->imm);
9551		return -EFAULT;
9552	}
9553
9554	if (is_async_callback_calling_insn(insn)) {
9555		struct bpf_verifier_state *async_cb;
9556
9557		/* there is no real recursion here. timer and workqueue callbacks are async */
9558		env->subprog_info[subprog].is_async_cb = true;
9559		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
9560					 insn_idx, subprog,
9561					 is_bpf_wq_set_callback_impl_kfunc(insn->imm));
9562		if (!async_cb)
9563			return -EFAULT;
9564		callee = async_cb->frame[0];
9565		callee->async_entry_cnt = caller->async_entry_cnt + 1;
9566
9567		/* Convert bpf_timer_set_callback() args into timer callback args */
9568		err = set_callee_state_cb(env, caller, callee, insn_idx);
9569		if (err)
9570			return err;
9571
9572		return 0;
9573	}
9574
9575	/* for callback functions enqueue entry to callback and
9576	 * proceed with next instruction within current frame.
9577	 */
9578	callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
9579	if (!callback_state)
9580		return -ENOMEM;
9581
9582	err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
9583			       callback_state);
9584	if (err)
9585		return err;
9586
9587	callback_state->callback_unroll_depth++;
9588	callback_state->frame[callback_state->curframe - 1]->callback_depth++;
9589	caller->callback_depth = 0;
9590	return 0;
9591}
9592
9593static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
9594			   int *insn_idx)
9595{
9596	struct bpf_verifier_state *state = env->cur_state;
9597	struct bpf_func_state *caller;
9598	int err, subprog, target_insn;
9599
9600	target_insn = *insn_idx + insn->imm + 1;
9601	subprog = find_subprog(env, target_insn);
9602	if (subprog < 0) {
9603		verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
9604		return -EFAULT;
9605	}
9606
9607	caller = state->frame[state->curframe];
9608	err = btf_check_subprog_call(env, subprog, caller->regs);
9609	if (err == -EFAULT)
9610		return err;
9611	if (subprog_is_global(env, subprog)) {
9612		const char *sub_name = subprog_name(env, subprog);
9613
9614		/* Only global subprogs cannot be called with a lock held. */
9615		if (env->cur_state->active_lock.ptr) {
9616			verbose(env, "global function calls are not allowed while holding a lock,\n"
9617				     "use static function instead\n");
9618			return -EINVAL;
9619		}
9620
9621		/* Only global subprogs cannot be called with preemption disabled. */
9622		if (env->cur_state->active_preempt_lock) {
9623			verbose(env, "global function calls are not allowed with preemption disabled,\n"
9624				     "use static function instead\n");
9625			return -EINVAL;
9626		}
9627
9628		if (err) {
9629			verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
9630				subprog, sub_name);
9631			return err;
9632		}
9633
9634		verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
9635			subprog, sub_name);
9636		/* mark global subprog for verifying after main prog */
9637		subprog_aux(env, subprog)->called = true;
9638		clear_caller_saved_regs(env, caller->regs);
9639
9640		/* All global functions return a 64-bit SCALAR_VALUE */
9641		mark_reg_unknown(env, caller->regs, BPF_REG_0);
9642		caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
9643
9644		/* continue with next insn after call */
9645		return 0;
9646	}
9647
9648	/* for regular function entry setup new frame and continue
9649	 * from that frame.
9650	 */
9651	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
9652	if (err)
9653		return err;
9654
9655	clear_caller_saved_regs(env, caller->regs);
9656
9657	/* and go analyze first insn of the callee */
9658	*insn_idx = env->subprog_info[subprog].start - 1;
9659
9660	if (env->log.level & BPF_LOG_LEVEL) {
9661		verbose(env, "caller:\n");
9662		print_verifier_state(env, caller, true);
9663		verbose(env, "callee:\n");
9664		print_verifier_state(env, state->frame[state->curframe], true);
9665	}
9666
9667	return 0;
9668}
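
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of the global-subprog restriction above: a global function is
 * verified independently of its callers, so calling it with a bpf_spin_lock
 * held is rejected:
 *
 *     __noinline int bump(int v)          // global (non-static) subprog
 *     {
 *             return v + 1;
 *     }
 *
 *     bpf_spin_lock(&val->lock);
 *     bump(1);                            // rejected while the lock is held
 *     bpf_spin_unlock(&val->lock);
 *
 * Making bump() static turns it into a regular subprog that is verified in
 * the context of each call site, which is allowed here.
 */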
9669
9670int map_set_for_each_callback_args(struct bpf_verifier_env *env,
9671				   struct bpf_func_state *caller,
9672				   struct bpf_func_state *callee)
9673{
9674	/* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
9675	 *      void *callback_ctx, u64 flags);
9676	 * callback_fn(struct bpf_map *map, void *key, void *value,
9677	 *      void *callback_ctx);
9678	 */
9679	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
9680
9681	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
9682	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
9683	callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
9684
9685	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
9686	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
9687	callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
9688
9689	/* pointer to stack or null */
9690	callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
9691
9692	/* unused */
9693	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9694	return 0;
9695}
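
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of the callback shape set up above:
 *
 *     static long check_elem(struct bpf_map *map, __u32 *key, __u64 *val,
 *                            void *ctx)
 *     {
 *             ...
 *             return 0;   // 0 = continue iterating, 1 = stop
 *     }
 *
 *     bpf_for_each_map_elem(&my_map, check_elem, &my_ctx, 0);
 */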
9696
9697static int set_callee_state(struct bpf_verifier_env *env,
9698			    struct bpf_func_state *caller,
9699			    struct bpf_func_state *callee, int insn_idx)
9700{
9701	int i;
9702
9703	/* copy r1 - r5 args that callee can access.  The copy includes parent
9704	 * pointers, which connects us up to the liveness chain
9705	 */
9706	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
9707		callee->regs[i] = caller->regs[i];
9708	return 0;
9709}
9710
9711static int set_map_elem_callback_state(struct bpf_verifier_env *env,
9712				       struct bpf_func_state *caller,
9713				       struct bpf_func_state *callee,
9714				       int insn_idx)
9715{
9716	struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
9717	struct bpf_map *map;
9718	int err;
9719
9720	/* valid map_ptr and poison value does not matter */
9721	map = insn_aux->map_ptr_state.map_ptr;
9722	if (!map->ops->map_set_for_each_callback_args ||
9723	    !map->ops->map_for_each_callback) {
9724		verbose(env, "callback function not allowed for map\n");
9725		return -ENOTSUPP;
9726	}
9727
9728	err = map->ops->map_set_for_each_callback_args(env, caller, callee);
9729	if (err)
9730		return err;
9731
9732	callee->in_callback_fn = true;
9733	callee->callback_ret_range = retval_range(0, 1);
9734	return 0;
9735}
9736
9737static int set_loop_callback_state(struct bpf_verifier_env *env,
9738				   struct bpf_func_state *caller,
9739				   struct bpf_func_state *callee,
9740				   int insn_idx)
9741{
9742	/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
9743	 *	    u64 flags);
9744	 * callback_fn(u32 index, void *callback_ctx);
9745	 */
9746	callee->regs[BPF_REG_1].type = SCALAR_VALUE;
9747	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
9748
9749	/* unused */
9750	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
9751	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
9752	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9753
9754	callee->in_callback_fn = true;
9755	callee->callback_ret_range = retval_range(0, 1);
9756	return 0;
9757}
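
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of the callback shape set up above:
 *
 *     static long step(__u32 index, void *ctx)
 *     {
 *             ...
 *             return 0;   // 0 = continue, 1 = break out of the loop
 *     }
 *
 *     bpf_loop(100, step, &my_ctx, 0);
 */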
9758
9759static int set_timer_callback_state(struct bpf_verifier_env *env,
9760				    struct bpf_func_state *caller,
9761				    struct bpf_func_state *callee,
9762				    int insn_idx)
9763{
9764	struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
9765
9766	/* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
9767	 * callback_fn(struct bpf_map *map, void *key, void *value);
9768	 */
9769	callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
9770	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
9771	callee->regs[BPF_REG_1].map_ptr = map_ptr;
9772
9773	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
9774	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
9775	callee->regs[BPF_REG_2].map_ptr = map_ptr;
9776
9777	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
9778	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
9779	callee->regs[BPF_REG_3].map_ptr = map_ptr;
9780
9781	/* unused */
9782	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
9783	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9784	callee->in_async_callback_fn = true;
9785	callee->callback_ret_range = retval_range(0, 1);
9786	return 0;
9787}
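
/* An illustrative BPF-side sketch (assumed program fragment, not part of the
 * kernel build) of the async callback shape set up above; 'struct elem' is an
 * assumed map value type that embeds the timer:
 *
 *     static int timer_cb(void *map, __u32 *key, struct elem *val)
 *     {
 *             ...
 *             return 0;
 *     }
 *
 *     bpf_timer_init(&val->t, &array_map, CLOCK_MONOTONIC);
 *     bpf_timer_set_callback(&val->t, timer_cb);
 *     bpf_timer_start(&val->t, 0, 0);     // nsecs, flags
 */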
9788
9789static int set_find_vma_callback_state(struct bpf_verifier_env *env,
9790				       struct bpf_func_state *caller,
9791				       struct bpf_func_state *callee,
9792				       int insn_idx)
9793{
9794	/* bpf_find_vma(struct task_struct *task, u64 addr,
9795	 *               void *callback_fn, void *callback_ctx, u64 flags)
9796	 * (callback_fn)(struct task_struct *task,
9797	 *               struct vm_area_struct *vma, void *callback_ctx);
9798	 */
9799	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
9800
9801	callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
9802	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
9803	callee->regs[BPF_REG_2].btf =  btf_vmlinux;
9804	callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
9805
9806	/* pointer to stack or null */
9807	callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
9808
9809	/* unused */
9810	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
9811	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9812	callee->in_callback_fn = true;
9813	callee->callback_ret_range = retval_range(0, 1);
9814	return 0;
9815}
9816
9817static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
9818					   struct bpf_func_state *caller,
9819					   struct bpf_func_state *callee,
9820					   int insn_idx)
9821{
9822	/* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn,
9823	 *			  void *callback_ctx, u64 flags);
9824	 * callback_fn(const struct bpf_dynptr_t *dynptr, void *callback_ctx);
9825	 */
9826	__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
9827	mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
9828	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
9829
9830	/* unused */
9831	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
9832	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
9833	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9834
9835	callee->in_callback_fn = true;
9836	callee->callback_ret_range = retval_range(0, 1);
9837	return 0;
9838}
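
/* Illustrative sketch, not verifier code: BPF-side use of
 * bpf_user_ringbuf_drain() matching the register setup above (callback
 * R1 = local dynptr, R2 = callback_ctx). user_rb and handle_sample are
 * hypothetical names; user_rb is assumed to be a BPF_MAP_TYPE_USER_RINGBUF map.
 *
 *   static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
 *   {
 *           return 0;          (return 1 to stop draining early)
 *   }
 *
 *   long nr = bpf_user_ringbuf_drain(&user_rb, handle_sample, NULL, 0);
 */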
9839
9840static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
9841					 struct bpf_func_state *caller,
9842					 struct bpf_func_state *callee,
9843					 int insn_idx)
9844{
9845	/* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
9846	 *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
9847	 *
9848	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
9849	 * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
9850	 * by this point, so look at 'root'
9851	 */
9852	struct btf_field *field;
9853
9854	field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
9855				      BPF_RB_ROOT);
9856	if (!field || !field->graph_root.value_btf_id)
9857		return -EFAULT;
9858
9859	mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
9860	ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
9861	mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
9862	ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
9863
9864	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
9865	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
9866	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
9867	callee->in_callback_fn = true;
9868	callee->callback_ret_range = retval_range(0, 1);
9869	return 0;
9870}
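
/* Illustrative sketch, not verifier code: the BPF-side 'less' callback whose
 * two bpf_rb_node arguments are set up above as non-owning graph-node
 * pointers. node_data, groot and glock are hypothetical names.
 *
 *   struct node_data { long key; struct bpf_rb_node node; };
 *
 *   static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
 *   {
 *           struct node_data *na = container_of(a, struct node_data, node);
 *           struct node_data *nb = container_of(b, struct node_data, node);
 *
 *           return na->key < nb->key;
 *   }
 *
 *   bpf_spin_lock(&glock);
 *   bpf_rbtree_add(&groot, &n->node, less);
 *   bpf_spin_unlock(&glock);
 */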
9871
9872static bool is_rbtree_lock_required_kfunc(u32 btf_id);
9873
9874/* Are we currently verifying the callback for an rbtree helper that must
9875 * be called with the lock held? If so, there is no need to complain about
9876 * an unreleased lock.
9877 */
9878static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
9879{
9880	struct bpf_verifier_state *state = env->cur_state;
9881	struct bpf_insn *insn = env->prog->insnsi;
9882	struct bpf_func_state *callee;
9883	int kfunc_btf_id;
9884
9885	if (!state->curframe)
9886		return false;
9887
9888	callee = state->frame[state->curframe];
9889
9890	if (!callee->in_callback_fn)
9891		return false;
9892
9893	kfunc_btf_id = insn[callee->callsite].imm;
9894	return is_rbtree_lock_required_kfunc(kfunc_btf_id);
9895}
9896
9897static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
9898{
9899	return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
9900}
9901
9902static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
9903{
9904	struct bpf_verifier_state *state = env->cur_state, *prev_st;
9905	struct bpf_func_state *caller, *callee;
9906	struct bpf_reg_state *r0;
9907	bool in_callback_fn;
9908	int err;
9909
9910	callee = state->frame[state->curframe];
9911	r0 = &callee->regs[BPF_REG_0];
9912	if (r0->type == PTR_TO_STACK) {
9913		/* technically it's ok to return the caller's stack pointer
9914		 * (or the caller's caller's pointer) back to the caller,
9915		 * since those pointers remain valid. Only the current frame's
9916		 * stack pointer becomes invalid as soon as the function exits,
9917		 * but let's be conservative
9918		 */
9919		verbose(env, "cannot return stack pointer to the caller\n");
9920		return -EINVAL;
9921	}
9922
9923	caller = state->frame[state->curframe - 1];
9924	if (callee->in_callback_fn) {
9925		if (r0->type != SCALAR_VALUE) {
9926			verbose(env, "R0 not a scalar value\n");
9927			return -EACCES;
9928		}
9929
9930		/* we are going to rely on register's precise value */
9931		err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
9932		err = err ?: mark_chain_precision(env, BPF_REG_0);
9933		if (err)
9934			return err;
9935
9936		/* enforce R0 return value range */
9937		if (!retval_range_within(callee->callback_ret_range, r0)) {
9938			verbose_invalid_scalar(env, r0, callee->callback_ret_range,
9939					       "At callback return", "R0");
9940			return -EINVAL;
9941		}
9942		if (!calls_callback(env, callee->callsite)) {
9943			verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
9944				*insn_idx, callee->callsite);
9945			return -EFAULT;
9946		}
9947	} else {
9948		/* return to the caller whatever r0 had in the callee */
9949		caller->regs[BPF_REG_0] = *r0;
9950	}
9951
9952	/* callback_fn frame should have released its own additions to parent's
9953	 * reference state at this point, or check_reference_leak would
9954	 * complain, hence it must be the same as the caller. There is no need
9955	 * to copy it back.
9956	 */
9957	if (!callee->in_callback_fn) {
9958		/* Transfer references to the caller */
9959		err = copy_reference_state(caller, callee);
9960		if (err)
9961			return err;
9962	}
9963
9964	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to the callsite,
9965	 * where the function call logic will reschedule the callback visit. If the
9966	 * iteration converges, is_state_visited() will eventually prune that visit.
9967	 */
9968	in_callback_fn = callee->in_callback_fn;
9969	if (in_callback_fn)
9970		*insn_idx = callee->callsite;
9971	else
9972		*insn_idx = callee->callsite + 1;
9973
9974	if (env->log.level & BPF_LOG_LEVEL) {
9975		verbose(env, "returning from callee:\n");
9976		print_verifier_state(env, callee, true);
9977		verbose(env, "to caller at %d:\n", *insn_idx);
9978		print_verifier_state(env, caller, true);
9979	}
9980	/* clear everything in the callee. In case of exceptional exits using
9981	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
9982	free_func_state(callee);
9983	state->frame[state->curframe--] = NULL;
9984
9985	/* for callbacks widen imprecise scalars to make programs like below verify:
9986	 *
9987	 *   struct ctx { int i; };
9988	 *   void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
9989	 *   ...
9990	 *   struct ctx ctx = { .i = 0 };
9991	 *   bpf_loop(100, cb, &ctx, 0);
9992	 *
9993	 * This is similar to what is done in process_iter_next_call() for open
9994	 * coded iterators.
9995	 */
9996	prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
9997	if (prev_st) {
9998		err = widen_imprecise_scalars(env, prev_st, state);
9999		if (err)
10000			return err;
10001	}
10002	return 0;
10003}
10004
10005static int do_refine_retval_range(struct bpf_verifier_env *env,
10006				  struct bpf_reg_state *regs, int ret_type,
10007				  int func_id,
10008				  struct bpf_call_arg_meta *meta)
10009{
10010	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
10011
10012	if (ret_type != RET_INTEGER)
10013		return 0;
10014
10015	switch (func_id) {
10016	case BPF_FUNC_get_stack:
10017	case BPF_FUNC_get_task_stack:
10018	case BPF_FUNC_probe_read_str:
10019	case BPF_FUNC_probe_read_kernel_str:
10020	case BPF_FUNC_probe_read_user_str:
10021		ret_reg->smax_value = meta->msize_max_value;
10022		ret_reg->s32_max_value = meta->msize_max_value;
10023		ret_reg->smin_value = -MAX_ERRNO;
10024		ret_reg->s32_min_value = -MAX_ERRNO;
10025		reg_bounds_sync(ret_reg);
10026		break;
10027	case BPF_FUNC_get_smp_processor_id:
10028		ret_reg->umax_value = nr_cpu_ids - 1;
10029		ret_reg->u32_max_value = nr_cpu_ids - 1;
10030		ret_reg->smax_value = nr_cpu_ids - 1;
10031		ret_reg->s32_max_value = nr_cpu_ids - 1;
10032		ret_reg->umin_value = 0;
10033		ret_reg->u32_min_value = 0;
10034		ret_reg->smin_value = 0;
10035		ret_reg->s32_min_value = 0;
10036		reg_bounds_sync(ret_reg);
10037		break;
10038	}
10039
10040	return reg_bounds_sanity_check(env, ret_reg, "retval");
10041}
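
/* Illustrative sketch, not verifier code: why the refinement above matters on
 * the BPF side. With bpf_get_stack()'s return value bounded to
 * [-MAX_ERRNO, buf size], a pattern like the following can reuse the checked
 * return value as a length. buf, events and SIZE are hypothetical.
 *
 *   long n = bpf_get_stack(ctx, buf, SIZE, 0);
 *
 *   if (n > 0)
 *           bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, buf, n);
 */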
10042
10043static int
10044record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
10045		int func_id, int insn_idx)
10046{
10047	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
10048	struct bpf_map *map = meta->map_ptr;
10049
10050	if (func_id != BPF_FUNC_tail_call &&
10051	    func_id != BPF_FUNC_map_lookup_elem &&
10052	    func_id != BPF_FUNC_map_update_elem &&
10053	    func_id != BPF_FUNC_map_delete_elem &&
10054	    func_id != BPF_FUNC_map_push_elem &&
10055	    func_id != BPF_FUNC_map_pop_elem &&
10056	    func_id != BPF_FUNC_map_peek_elem &&
10057	    func_id != BPF_FUNC_for_each_map_elem &&
10058	    func_id != BPF_FUNC_redirect_map &&
10059	    func_id != BPF_FUNC_map_lookup_percpu_elem)
10060		return 0;
10061
10062	if (map == NULL) {
10063		verbose(env, "kernel subsystem misconfigured verifier\n");
10064		return -EINVAL;
10065	}
10066
10067	/* In the read-only case, some additional restrictions
10068	 * need to be applied to prevent the program from
10069	 * altering the state of the map.
10070	 */
10071	if ((map->map_flags & BPF_F_RDONLY_PROG) &&
10072	    (func_id == BPF_FUNC_map_delete_elem ||
10073	     func_id == BPF_FUNC_map_update_elem ||
10074	     func_id == BPF_FUNC_map_push_elem ||
10075	     func_id == BPF_FUNC_map_pop_elem)) {
10076		verbose(env, "write into map forbidden\n");
10077		return -EACCES;
10078	}
10079
10080	if (!aux->map_ptr_state.map_ptr)
10081		bpf_map_ptr_store(aux, meta->map_ptr,
10082				  !meta->map_ptr->bypass_spec_v1, false);
10083	else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
10084		bpf_map_ptr_store(aux, meta->map_ptr,
10085				  !meta->map_ptr->bypass_spec_v1, true);
10086	return 0;
10087}
10088
10089static int
10090record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
10091		int func_id, int insn_idx)
10092{
10093	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
10094	struct bpf_reg_state *regs = cur_regs(env), *reg;
10095	struct bpf_map *map = meta->map_ptr;
10096	u64 val, max;
10097	int err;
10098
10099	if (func_id != BPF_FUNC_tail_call)
10100		return 0;
10101	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
10102		verbose(env, "kernel subsystem misconfigured verifier\n");
10103		return -EINVAL;
10104	}
10105
10106	reg = &regs[BPF_REG_3];
10107	val = reg->var_off.value;
10108	max = map->max_entries;
10109
10110	if (!(is_reg_const(reg, false) && val < max)) {
10111		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
10112		return 0;
10113	}
10114
10115	err = mark_chain_precision(env, BPF_REG_3);
10116	if (err)
10117		return err;
10118	if (bpf_map_key_unseen(aux))
10119		bpf_map_key_store(aux, val);
10120	else if (!bpf_map_key_poisoned(aux) &&
10121		  bpf_map_key_immediate(aux) != val)
10122		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
10123	return 0;
10124}
10125
10126static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
10127{
10128	struct bpf_func_state *state = cur_func(env);
10129	bool refs_lingering = false;
10130	int i;
10131
10132	if (!exception_exit && state->frameno && !state->in_callback_fn)
10133		return 0;
10134
10135	for (i = 0; i < state->acquired_refs; i++) {
10136		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
10137			continue;
10138		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
10139			state->refs[i].id, state->refs[i].insn_idx);
10140		refs_lingering = true;
10141	}
10142	return refs_lingering ? -EINVAL : 0;
10143}
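
/* Illustrative sketch, not verifier code: the kind of program the check above
 * rejects -- a socket reference acquired on one path but never released:
 *
 *   struct bpf_sock *sk;
 *
 *   sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
 *   if (!sk)
 *           return TC_ACT_OK;
 *   if (sk->state == BPF_TCP_LISTEN)
 *           return TC_ACT_OK;          leak: sk is not released on this path
 *   bpf_sk_release(sk);
 *   return TC_ACT_OK;
 */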
10144
10145static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
10146				   struct bpf_reg_state *regs)
10147{
10148	struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
10149	struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
10150	struct bpf_map *fmt_map = fmt_reg->map_ptr;
10151	struct bpf_bprintf_data data = {};
10152	int err, fmt_map_off, num_args;
10153	u64 fmt_addr;
10154	char *fmt;
10155
10156	/* data must be an array of u64 */
10157	if (data_len_reg->var_off.value % 8)
10158		return -EINVAL;
10159	num_args = data_len_reg->var_off.value / 8;
10160
10161	/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
10162	 * and map_direct_value_addr is set.
10163	 */
10164	fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
10165	err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
10166						  fmt_map_off);
10167	if (err) {
10168		verbose(env, "verifier bug\n");
10169		return -EFAULT;
10170	}
10171	fmt = (char *)(long)fmt_addr + fmt_map_off;
10172
10173	/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, so we
10174	 * can focus on validating the format specifiers.
10175	 */
10176	err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
10177	if (err < 0)
10178		verbose(env, "Invalid format string\n");
10179
10180	return err;
10181}
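
/* Illustrative sketch, not verifier code: a BPF-side bpf_snprintf() call of
 * the shape validated above -- a constant format string plus a u64 data array
 * whose byte size (R5) must be a multiple of 8. The names are hypothetical.
 *
 *   static const char fmt[] = "pid %d prio %d";
 *   u64 args[] = { pid, prio };
 *   char out[32];
 *
 *   bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));
 */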
10182
10183static int check_get_func_ip(struct bpf_verifier_env *env)
10184{
10185	enum bpf_prog_type type = resolve_prog_type(env->prog);
10186	int func_id = BPF_FUNC_get_func_ip;
10187
10188	if (type == BPF_PROG_TYPE_TRACING) {
10189		if (!bpf_prog_has_trampoline(env->prog)) {
10190			verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
10191				func_id_name(func_id), func_id);
10192			return -ENOTSUPP;
10193		}
10194		return 0;
10195	} else if (type == BPF_PROG_TYPE_KPROBE) {
10196		return 0;
10197	}
10198
10199	verbose(env, "func %s#%d not supported for program type %d\n",
10200		func_id_name(func_id), func_id, type);
10201	return -ENOTSUPP;
10202}
10203
10204static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
10205{
10206	return &env->insn_aux_data[env->insn_idx];
10207}
10208
10209static bool loop_flag_is_zero(struct bpf_verifier_env *env)
10210{
10211	struct bpf_reg_state *regs = cur_regs(env);
10212	struct bpf_reg_state *reg = &regs[BPF_REG_4];
10213	bool reg_is_null = register_is_null(reg);
10214
10215	if (reg_is_null)
10216		mark_chain_precision(env, BPF_REG_4);
10217
10218	return reg_is_null;
10219}
10220
10221static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
10222{
10223	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
10224
10225	if (!state->initialized) {
10226		state->initialized = 1;
10227		state->fit_for_inline = loop_flag_is_zero(env);
10228		state->callback_subprogno = subprogno;
10229		return;
10230	}
10231
10232	if (!state->fit_for_inline)
10233		return;
10234
10235	state->fit_for_inline = (loop_flag_is_zero(env) &&
10236				 state->callback_subprogno == subprogno);
10237}
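
/* Illustrative sketch, not verifier code: the bpf_loop() shapes tracked above.
 * A call site like
 *
 *   bpf_loop(100, cb, &ctx, 0);
 *
 * (flags statically zero, same callback on every path) remains fit_for_inline,
 * whereas
 *
 *   bpf_loop(100, cb, &ctx, flags_var);
 *
 * does not, because R4 is not provably zero at verification time.
 */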
10238
10239static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
10240			     int *insn_idx_p)
10241{
10242	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
10243	bool returns_cpu_specific_alloc_ptr = false;
10244	const struct bpf_func_proto *fn = NULL;
10245	enum bpf_return_type ret_type;
10246	enum bpf_type_flag ret_flag;
10247	struct bpf_reg_state *regs;
10248	struct bpf_call_arg_meta meta;
10249	int insn_idx = *insn_idx_p;
10250	bool changes_data;
10251	int i, err, func_id;
10252
10253	/* find function prototype */
10254	func_id = insn->imm;
10255	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
10256		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
10257			func_id);
10258		return -EINVAL;
10259	}
10260
10261	if (env->ops->get_func_proto)
10262		fn = env->ops->get_func_proto(func_id, env->prog);
10263	if (!fn) {
10264		verbose(env, "program of this type cannot use helper %s#%d\n",
10265			func_id_name(func_id), func_id);
10266		return -EINVAL;
10267	}
10268
10269	/* eBPF programs must be GPL compatible to use GPL-ed functions */
10270	if (!env->prog->gpl_compatible && fn->gpl_only) {
10271		verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
10272		return -EINVAL;
10273	}
10274
10275	if (fn->allowed && !fn->allowed(env->prog)) {
10276		verbose(env, "helper call is not allowed in probe\n");
10277		return -EINVAL;
10278	}
10279
10280	if (!in_sleepable(env) && fn->might_sleep) {
10281		verbose(env, "helper call might sleep in a non-sleepable prog\n");
10282		return -EINVAL;
10283	}
10284
10285	/* With LD_ABS/IND some JITs save/restore skb from r1. */
10286	changes_data = bpf_helper_changes_pkt_data(fn->func);
10287	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
10288		verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
10289			func_id_name(func_id), func_id);
10290		return -EINVAL;
10291	}
10292
10293	memset(&meta, 0, sizeof(meta));
10294	meta.pkt_access = fn->pkt_access;
10295
10296	err = check_func_proto(fn, func_id);
10297	if (err) {
10298		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
10299			func_id_name(func_id), func_id);
10300		return err;
10301	}
10302
10303	if (env->cur_state->active_rcu_lock) {
10304		if (fn->might_sleep) {
10305			verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
10306				func_id_name(func_id), func_id);
10307			return -EINVAL;
10308		}
10309
10310		if (in_sleepable(env) && is_storage_get_function(func_id))
10311			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
10312	}
10313
10314	if (env->cur_state->active_preempt_lock) {
10315		if (fn->might_sleep) {
10316			verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
10317				func_id_name(func_id), func_id);
10318			return -EINVAL;
10319		}
10320
10321		if (in_sleepable(env) && is_storage_get_function(func_id))
10322			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
10323	}
10324
10325	meta.func_id = func_id;
10326	/* check args */
10327	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
10328		err = check_func_arg(env, i, &meta, fn, insn_idx);
10329		if (err)
10330			return err;
10331	}
10332
10333	err = record_func_map(env, &meta, func_id, insn_idx);
10334	if (err)
10335		return err;
10336
10337	err = record_func_key(env, &meta, func_id, insn_idx);
10338	if (err)
10339		return err;
10340
10341	/* Mark slots with STACK_MISC in case of raw mode; the stack offset
10342	 * is inferred from register state.
10343	 */
10344	for (i = 0; i < meta.access_size; i++) {
10345		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
10346				       BPF_WRITE, -1, false, false);
10347		if (err)
10348			return err;
10349	}
10350
10351	regs = cur_regs(env);
10352
10353	if (meta.release_regno) {
10354		err = -EINVAL;
10355		/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
10356		 * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
10357		 * is safe to do directly.
10358		 */
10359		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
10360			if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
10361				verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
10362				return -EFAULT;
10363			}
10364			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
10365		} else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
10366			u32 ref_obj_id = meta.ref_obj_id;
10367			bool in_rcu = in_rcu_cs(env);
10368			struct bpf_func_state *state;
10369			struct bpf_reg_state *reg;
10370
10371			err = release_reference_state(cur_func(env), ref_obj_id);
10372			if (!err) {
10373				bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
10374					if (reg->ref_obj_id == ref_obj_id) {
10375						if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
10376							reg->ref_obj_id = 0;
10377							reg->type &= ~MEM_ALLOC;
10378							reg->type |= MEM_RCU;
10379						} else {
10380							mark_reg_invalid(env, reg);
10381						}
10382					}
10383				}));
10384			}
10385		} else if (meta.ref_obj_id) {
10386			err = release_reference(env, meta.ref_obj_id);
10387		} else if (register_is_null(&regs[meta.release_regno])) {
10388			/* meta.ref_obj_id can only be 0 if the register that is meant to be
10389			 * released is NULL, and that register must be > R0.
10390			 */
10391			err = 0;
10392		}
10393		if (err) {
10394			verbose(env, "func %s#%d reference has not been acquired before\n",
10395				func_id_name(func_id), func_id);
10396			return err;
10397		}
10398	}
10399
10400	switch (func_id) {
10401	case BPF_FUNC_tail_call:
10402		err = check_reference_leak(env, false);
10403		if (err) {
10404			verbose(env, "tail_call would lead to reference leak\n");
10405			return err;
10406		}
10407		break;
10408	case BPF_FUNC_get_local_storage:
10409		/* check that the flags argument in get_local_storage(map, flags) is 0;
10410		 * this is required because get_local_storage() can't return an error.
10411		 */
10412		if (!register_is_null(&regs[BPF_REG_2])) {
10413			verbose(env, "get_local_storage() doesn't support non-zero flags\n");
10414			return -EINVAL;
10415		}
10416		break;
10417	case BPF_FUNC_for_each_map_elem:
10418		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
10419					 set_map_elem_callback_state);
10420		break;
10421	case BPF_FUNC_timer_set_callback:
10422		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
10423					 set_timer_callback_state);
10424		break;
10425	case BPF_FUNC_find_vma:
10426		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
10427					 set_find_vma_callback_state);
10428		break;
10429	case BPF_FUNC_snprintf:
10430		err = check_bpf_snprintf_call(env, regs);
10431		break;
10432	case BPF_FUNC_loop:
10433		update_loop_inline_state(env, meta.subprogno);
10434		/* The verifier relies on the R1 value to determine whether the
10435		 * bpf_loop() iteration is finished, thus mark it precise.
10436		 */
10437		err = mark_chain_precision(env, BPF_REG_1);
10438		if (err)
10439			return err;
10440		if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
10441			err = push_callback_call(env, insn, insn_idx, meta.subprogno,
10442						 set_loop_callback_state);
10443		} else {
10444			cur_func(env)->callback_depth = 0;
10445			if (env->log.level & BPF_LOG_LEVEL2)
10446				verbose(env, "frame%d bpf_loop iteration limit reached\n",
10447					env->cur_state->curframe);
10448		}
10449		break;
10450	case BPF_FUNC_dynptr_from_mem:
10451		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
10452			verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
10453				reg_type_str(env, regs[BPF_REG_1].type));
10454			return -EACCES;
10455		}
10456		break;
10457	case BPF_FUNC_set_retval:
10458		if (prog_type == BPF_PROG_TYPE_LSM &&
10459		    env->prog->expected_attach_type == BPF_LSM_CGROUP) {
10460			if (!env->prog->aux->attach_func_proto->type) {
10461				/* Make sure programs that attach to void
10462				 * hooks don't try to modify return value.
10463				 */
10464				verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
10465				return -EINVAL;
10466			}
10467		}
10468		break;
10469	case BPF_FUNC_dynptr_data:
10470	{
10471		struct bpf_reg_state *reg;
10472		int id, ref_obj_id;
10473
10474		reg = get_dynptr_arg_reg(env, fn, regs);
10475		if (!reg)
10476			return -EFAULT;
10477
10479		if (meta.dynptr_id) {
10480			verbose(env, "verifier internal error: meta.dynptr_id already set\n");
10481			return -EFAULT;
10482		}
10483		if (meta.ref_obj_id) {
10484			verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
10485			return -EFAULT;
10486		}
10487
10488		id = dynptr_id(env, reg);
10489		if (id < 0) {
10490			verbose(env, "verifier internal error: failed to obtain dynptr id\n");
10491			return id;
10492		}
10493
10494		ref_obj_id = dynptr_ref_obj_id(env, reg);
10495		if (ref_obj_id < 0) {
10496			verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
10497			return ref_obj_id;
10498		}
10499
10500		meta.dynptr_id = id;
10501		meta.ref_obj_id = ref_obj_id;
10502
10503		break;
10504	}
10505	case BPF_FUNC_dynptr_write:
10506	{
10507		enum bpf_dynptr_type dynptr_type;
10508		struct bpf_reg_state *reg;
10509
10510		reg = get_dynptr_arg_reg(env, fn, regs);
10511		if (!reg)
10512			return -EFAULT;
10513
10514		dynptr_type = dynptr_get_type(env, reg);
10515		if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
10516			return -EFAULT;
10517
10518		if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
10519			/* this will trigger clear_all_pkt_pointers(), which will
10520			 * invalidate all dynptr slices associated with the skb
10521			 */
10522			changes_data = true;
10523
10524		break;
10525	}
10526	case BPF_FUNC_per_cpu_ptr:
10527	case BPF_FUNC_this_cpu_ptr:
10528	{
10529		struct bpf_reg_state *reg = &regs[BPF_REG_1];
10530		const struct btf_type *type;
10531
10532		if (reg->type & MEM_RCU) {
10533			type = btf_type_by_id(reg->btf, reg->btf_id);
10534			if (!type || !btf_type_is_struct(type)) {
10535				verbose(env, "Helper has invalid btf/btf_id in R1\n");
10536				return -EFAULT;
10537			}
10538			returns_cpu_specific_alloc_ptr = true;
10539			env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
10540		}
10541		break;
10542	}
10543	case BPF_FUNC_user_ringbuf_drain:
10544		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
10545					 set_user_ringbuf_callback_state);
10546		break;
10547	}
10548
10549	if (err)
10550		return err;
10551
10552	/* reset caller saved regs */
10553	for (i = 0; i < CALLER_SAVED_REGS; i++) {
10554		mark_reg_not_init(env, regs, caller_saved[i]);
10555		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
10556	}
10557
10558	/* helper call returns 64-bit value. */
10559	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
10560
10561	/* update return register (already marked as written above) */
10562	ret_type = fn->ret_type;
10563	ret_flag = type_flag(ret_type);
10564
10565	switch (base_type(ret_type)) {
10566	case RET_INTEGER:
10567		/* sets type to SCALAR_VALUE */
10568		mark_reg_unknown(env, regs, BPF_REG_0);
10569		break;
10570	case RET_VOID:
10571		regs[BPF_REG_0].type = NOT_INIT;
10572		break;
10573	case RET_PTR_TO_MAP_VALUE:
10574		/* There is no offset yet applied, variable or fixed */
10575		mark_reg_known_zero(env, regs, BPF_REG_0);
10576		/* remember map_ptr, so that check_map_access()
10577		 * can check 'value_size' boundary of memory access
10578		 * to map element returned from bpf_map_lookup_elem()
10579		 */
10580		if (meta.map_ptr == NULL) {
10581			verbose(env,
10582				"kernel subsystem misconfigured verifier\n");
10583			return -EINVAL;
10584		}
10585		regs[BPF_REG_0].map_ptr = meta.map_ptr;
10586		regs[BPF_REG_0].map_uid = meta.map_uid;
10587		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
10588		if (!type_may_be_null(ret_type) &&
10589		    btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
10590			regs[BPF_REG_0].id = ++env->id_gen;
10591		}
10592		break;
10593	case RET_PTR_TO_SOCKET:
10594		mark_reg_known_zero(env, regs, BPF_REG_0);
10595		regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
10596		break;
10597	case RET_PTR_TO_SOCK_COMMON:
10598		mark_reg_known_zero(env, regs, BPF_REG_0);
10599		regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
10600		break;
10601	case RET_PTR_TO_TCP_SOCK:
10602		mark_reg_known_zero(env, regs, BPF_REG_0);
10603		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
10604		break;
10605	case RET_PTR_TO_MEM:
10606		mark_reg_known_zero(env, regs, BPF_REG_0);
10607		regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
10608		regs[BPF_REG_0].mem_size = meta.mem_size;
10609		break;
10610	case RET_PTR_TO_MEM_OR_BTF_ID:
10611	{
10612		const struct btf_type *t;
10613
10614		mark_reg_known_zero(env, regs, BPF_REG_0);
10615		t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
10616		if (!btf_type_is_struct(t)) {
10617			u32 tsize;
10618			const struct btf_type *ret;
10619			const char *tname;
10620
10621			/* resolve the type size of ksym. */
10622			ret = btf_resolve_size(meta.ret_btf, t, &tsize);
10623			if (IS_ERR(ret)) {
10624				tname = btf_name_by_offset(meta.ret_btf, t->name_off);
10625				verbose(env, "unable to resolve the size of type '%s': %ld\n",
10626					tname, PTR_ERR(ret));
10627				return -EINVAL;
10628			}
10629			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
10630			regs[BPF_REG_0].mem_size = tsize;
10631		} else {
10632			if (returns_cpu_specific_alloc_ptr) {
10633				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
10634			} else {
10635				/* MEM_RDONLY may be carried from ret_flag, but it
10636				 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
10637				 * it will confuse the check of PTR_TO_BTF_ID in
10638				 * check_mem_access().
10639				 */
10640				ret_flag &= ~MEM_RDONLY;
10641				regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
10642			}
10643
10644			regs[BPF_REG_0].btf = meta.ret_btf;
10645			regs[BPF_REG_0].btf_id = meta.ret_btf_id;
10646		}
10647		break;
10648	}
10649	case RET_PTR_TO_BTF_ID:
10650	{
10651		struct btf *ret_btf;
10652		int ret_btf_id;
10653
10654		mark_reg_known_zero(env, regs, BPF_REG_0);
10655		regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
10656		if (func_id == BPF_FUNC_kptr_xchg) {
10657			ret_btf = meta.kptr_field->kptr.btf;
10658			ret_btf_id = meta.kptr_field->kptr.btf_id;
10659			if (!btf_is_kernel(ret_btf)) {
10660				regs[BPF_REG_0].type |= MEM_ALLOC;
10661				if (meta.kptr_field->type == BPF_KPTR_PERCPU)
10662					regs[BPF_REG_0].type |= MEM_PERCPU;
10663			}
10664		} else {
10665			if (fn->ret_btf_id == BPF_PTR_POISON) {
10666				verbose(env, "verifier internal error:");
10667				verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
10668					func_id_name(func_id));
10669				return -EINVAL;
10670			}
10671			ret_btf = btf_vmlinux;
10672			ret_btf_id = *fn->ret_btf_id;
10673		}
10674		if (ret_btf_id == 0) {
10675			verbose(env, "invalid return type %u of func %s#%d\n",
10676				base_type(ret_type), func_id_name(func_id),
10677				func_id);
10678			return -EINVAL;
10679		}
10680		regs[BPF_REG_0].btf = ret_btf;
10681		regs[BPF_REG_0].btf_id = ret_btf_id;
10682		break;
10683	}
10684	default:
10685		verbose(env, "unknown return type %u of func %s#%d\n",
10686			base_type(ret_type), func_id_name(func_id), func_id);
10687		return -EINVAL;
10688	}
10689
10690	if (type_may_be_null(regs[BPF_REG_0].type))
10691		regs[BPF_REG_0].id = ++env->id_gen;
10692
10693	if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
10694		verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
10695			func_id_name(func_id), func_id);
10696		return -EFAULT;
10697	}
10698
10699	if (is_dynptr_ref_function(func_id))
10700		regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
10701
10702	if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
10703		/* For release_reference() */
10704		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
10705	} else if (is_acquire_function(func_id, meta.map_ptr)) {
10706		int id = acquire_reference_state(env, insn_idx);
10707
10708		if (id < 0)
10709			return id;
10710		/* For mark_ptr_or_null_reg() */
10711		regs[BPF_REG_0].id = id;
10712		/* For release_reference() */
10713		regs[BPF_REG_0].ref_obj_id = id;
10714	}
10715
10716	err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
10717	if (err)
10718		return err;
10719
10720	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
10721	if (err)
10722		return err;
10723
10724	if ((func_id == BPF_FUNC_get_stack ||
10725	     func_id == BPF_FUNC_get_task_stack) &&
10726	    !env->prog->has_callchain_buf) {
10727		const char *err_str;
10728
10729#ifdef CONFIG_PERF_EVENTS
10730		err = get_callchain_buffers(sysctl_perf_event_max_stack);
10731		err_str = "cannot get callchain buffer for func %s#%d\n";
10732#else
10733		err = -ENOTSUPP;
10734		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
10735#endif
10736		if (err) {
10737			verbose(env, err_str, func_id_name(func_id), func_id);
10738			return err;
10739		}
10740
10741		env->prog->has_callchain_buf = true;
10742	}
10743
10744	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
10745		env->prog->call_get_stack = true;
10746
10747	if (func_id == BPF_FUNC_get_func_ip) {
10748		if (check_get_func_ip(env))
10749			return -ENOTSUPP;
10750		env->prog->call_get_func_ip = true;
10751	}
10752
10753	if (changes_data)
10754		clear_all_pkt_pointers(env);
10755	return 0;
10756}
10757
10758/* mark_btf_func_reg_size() is used when the reg size is determined by
10759 * the size of the BTF func_proto's return value or argument.
10760 */
10761static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
10762				   size_t reg_size)
10763{
10764	struct bpf_reg_state *reg = &cur_regs(env)[regno];
10765
10766	if (regno == BPF_REG_0) {
10767		/* Function return value */
10768		reg->live |= REG_LIVE_WRITTEN;
10769		reg->subreg_def = reg_size == sizeof(u64) ?
10770			DEF_NOT_SUBREG : env->insn_idx + 1;
10771	} else {
10772		/* Function argument */
10773		if (reg_size == sizeof(u64)) {
10774			mark_insn_zext(env, reg);
10775			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
10776		} else {
10777			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
10778		}
10779	}
10780}
10781
10782static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
10783{
10784	return meta->kfunc_flags & KF_ACQUIRE;
10785}
10786
10787static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
10788{
10789	return meta->kfunc_flags & KF_RELEASE;
10790}
10791
10792static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
10793{
10794	return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
10795}
10796
10797static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
10798{
10799	return meta->kfunc_flags & KF_SLEEPABLE;
10800}
10801
10802static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
10803{
10804	return meta->kfunc_flags & KF_DESTRUCTIVE;
10805}
10806
10807static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
10808{
10809	return meta->kfunc_flags & KF_RCU;
10810}
10811
10812static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
10813{
10814	return meta->kfunc_flags & KF_RCU_PROTECTED;
10815}
10816
10817static bool is_kfunc_arg_mem_size(const struct btf *btf,
10818				  const struct btf_param *arg,
10819				  const struct bpf_reg_state *reg)
10820{
10821	const struct btf_type *t;
10822
10823	t = btf_type_skip_modifiers(btf, arg->type, NULL);
10824	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
10825		return false;
10826
10827	return btf_param_match_suffix(btf, arg, "__sz");
10828}
10829
10830static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
10831					const struct btf_param *arg,
10832					const struct bpf_reg_state *reg)
10833{
10834	const struct btf_type *t;
10835
10836	t = btf_type_skip_modifiers(btf, arg->type, NULL);
10837	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
10838		return false;
10839
10840	return btf_param_match_suffix(btf, arg, "__szk");
10841}
10842
10843static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
10844{
10845	return btf_param_match_suffix(btf, arg, "__opt");
10846}
10847
10848static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
10849{
10850	return btf_param_match_suffix(btf, arg, "__k");
10851}
10852
10853static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
10854{
10855	return btf_param_match_suffix(btf, arg, "__ign");
10856}
10857
10858static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
10859{
10860	return btf_param_match_suffix(btf, arg, "__map");
10861}
10862
10863static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
10864{
10865	return btf_param_match_suffix(btf, arg, "__alloc");
10866}
10867
10868static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
10869{
10870	return btf_param_match_suffix(btf, arg, "__uninit");
10871}
10872
10873static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
10874{
10875	return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
10876}
10877
10878static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
10879{
10880	return btf_param_match_suffix(btf, arg, "__nullable");
10881}
10882
10883static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
10884{
10885	return btf_param_match_suffix(btf, arg, "__str");
10886}
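
/* The suffix matchers above correspond to parameter-name annotations in kfunc
 * prototypes, e.g. (shapes shown for illustration; see the kfunc definitions
 * for the authoritative signatures):
 *
 *   void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign);
 *   void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
 *                          void *buffer__opt, u32 buffer__szk);
 *
 * local_type_id__k must be a known constant, meta__ign is skipped by argument
 * checking, buffer__opt may be NULL and buffer__szk is its constant size.
 */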
10887
10888static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
10889					  const struct btf_param *arg,
10890					  const char *name)
10891{
10892	int len, target_len = strlen(name);
10893	const char *param_name;
10894
10895	param_name = btf_name_by_offset(btf, arg->name_off);
10896	if (str_is_empty(param_name))
10897		return false;
10898	len = strlen(param_name);
10899	if (len != target_len)
10900		return false;
10901	if (strcmp(param_name, name))
10902		return false;
10903
10904	return true;
10905}
10906
10907enum {
10908	KF_ARG_DYNPTR_ID,
10909	KF_ARG_LIST_HEAD_ID,
10910	KF_ARG_LIST_NODE_ID,
10911	KF_ARG_RB_ROOT_ID,
10912	KF_ARG_RB_NODE_ID,
10913	KF_ARG_WORKQUEUE_ID,
10914};
10915
10916BTF_ID_LIST(kf_arg_btf_ids)
10917BTF_ID(struct, bpf_dynptr_kern)
10918BTF_ID(struct, bpf_list_head)
10919BTF_ID(struct, bpf_list_node)
10920BTF_ID(struct, bpf_rb_root)
10921BTF_ID(struct, bpf_rb_node)
10922BTF_ID(struct, bpf_wq)
10923
10924static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
10925				    const struct btf_param *arg, int type)
10926{
10927	const struct btf_type *t;
10928	u32 res_id;
10929
10930	t = btf_type_skip_modifiers(btf, arg->type, NULL);
10931	if (!t)
10932		return false;
10933	if (!btf_type_is_ptr(t))
10934		return false;
10935	t = btf_type_skip_modifiers(btf, t->type, &res_id);
10936	if (!t)
10937		return false;
10938	return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
10939}
10940
10941static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
10942{
10943	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
10944}
10945
10946static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
10947{
10948	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
10949}
10950
10951static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
10952{
10953	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
10954}
10955
10956static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
10957{
10958	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
10959}
10960
10961static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
10962{
10963	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
10964}
10965
10966static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
10967{
10968	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
10969}
10970
10971static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
10972				  const struct btf_param *arg)
10973{
10974	const struct btf_type *t;
10975
10976	t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
10977	if (!t)
10978		return false;
10979
10980	return true;
10981}
10982
10983/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
10984static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
10985					const struct btf *btf,
10986					const struct btf_type *t, int rec)
10987{
10988	const struct btf_type *member_type;
10989	const struct btf_member *member;
10990	u32 i;
10991
10992	if (!btf_type_is_struct(t))
10993		return false;
10994
10995	for_each_member(i, t, member) {
10996		const struct btf_array *array;
10997
10998		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
10999		if (btf_type_is_struct(member_type)) {
11000			if (rec >= 3) {
11001				verbose(env, "max struct nesting depth exceeded\n");
11002				return false;
11003			}
11004			if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
11005				return false;
11006			continue;
11007		}
11008		if (btf_type_is_array(member_type)) {
11009			array = btf_array(member_type);
11010			if (!array->nelems)
11011				return false;
11012			member_type = btf_type_skip_modifiers(btf, array->type, NULL);
11013			if (!btf_type_is_scalar(member_type))
11014				return false;
11015			continue;
11016		}
11017		if (!btf_type_is_scalar(member_type))
11018			return false;
11019	}
11020	return true;
11021}
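
/* Illustrative sketch: a struct accepted by the check above vs. one that is
 * rejected. The names are hypothetical.
 *
 *   struct ok  { u32 a; u64 b[4]; struct { u16 c; } inner; };   accepted
 *   struct bad { u32 a; void *p; };                             rejected (pointer member)
 */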
11022
11023enum kfunc_ptr_arg_type {
11024	KF_ARG_PTR_TO_CTX,
11025	KF_ARG_PTR_TO_ALLOC_BTF_ID,    /* Allocated object */
11026	KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
11027	KF_ARG_PTR_TO_DYNPTR,
11028	KF_ARG_PTR_TO_ITER,
11029	KF_ARG_PTR_TO_LIST_HEAD,
11030	KF_ARG_PTR_TO_LIST_NODE,
11031	KF_ARG_PTR_TO_BTF_ID,	       /* Also covers reg2btf_ids conversions */
11032	KF_ARG_PTR_TO_MEM,
11033	KF_ARG_PTR_TO_MEM_SIZE,	       /* Size derived from next argument, skip it */
11034	KF_ARG_PTR_TO_CALLBACK,
11035	KF_ARG_PTR_TO_RB_ROOT,
11036	KF_ARG_PTR_TO_RB_NODE,
11037	KF_ARG_PTR_TO_NULL,
11038	KF_ARG_PTR_TO_CONST_STR,
11039	KF_ARG_PTR_TO_MAP,
11040	KF_ARG_PTR_TO_WORKQUEUE,
11041};
11042
11043enum special_kfunc_type {
11044	KF_bpf_obj_new_impl,
11045	KF_bpf_obj_drop_impl,
11046	KF_bpf_refcount_acquire_impl,
11047	KF_bpf_list_push_front_impl,
11048	KF_bpf_list_push_back_impl,
11049	KF_bpf_list_pop_front,
11050	KF_bpf_list_pop_back,
11051	KF_bpf_cast_to_kern_ctx,
11052	KF_bpf_rdonly_cast,
11053	KF_bpf_rcu_read_lock,
11054	KF_bpf_rcu_read_unlock,
11055	KF_bpf_rbtree_remove,
11056	KF_bpf_rbtree_add_impl,
11057	KF_bpf_rbtree_first,
11058	KF_bpf_dynptr_from_skb,
11059	KF_bpf_dynptr_from_xdp,
11060	KF_bpf_dynptr_slice,
11061	KF_bpf_dynptr_slice_rdwr,
11062	KF_bpf_dynptr_clone,
11063	KF_bpf_percpu_obj_new_impl,
11064	KF_bpf_percpu_obj_drop_impl,
11065	KF_bpf_throw,
11066	KF_bpf_wq_set_callback_impl,
11067	KF_bpf_preempt_disable,
11068	KF_bpf_preempt_enable,
11069	KF_bpf_iter_css_task_new,
11070	KF_bpf_session_cookie,
11071};
11072
11073BTF_SET_START(special_kfunc_set)
11074BTF_ID(func, bpf_obj_new_impl)
11075BTF_ID(func, bpf_obj_drop_impl)
11076BTF_ID(func, bpf_refcount_acquire_impl)
11077BTF_ID(func, bpf_list_push_front_impl)
11078BTF_ID(func, bpf_list_push_back_impl)
11079BTF_ID(func, bpf_list_pop_front)
11080BTF_ID(func, bpf_list_pop_back)
11081BTF_ID(func, bpf_cast_to_kern_ctx)
11082BTF_ID(func, bpf_rdonly_cast)
11083BTF_ID(func, bpf_rbtree_remove)
11084BTF_ID(func, bpf_rbtree_add_impl)
11085BTF_ID(func, bpf_rbtree_first)
11086BTF_ID(func, bpf_dynptr_from_skb)
11087BTF_ID(func, bpf_dynptr_from_xdp)
11088BTF_ID(func, bpf_dynptr_slice)
11089BTF_ID(func, bpf_dynptr_slice_rdwr)
11090BTF_ID(func, bpf_dynptr_clone)
11091BTF_ID(func, bpf_percpu_obj_new_impl)
11092BTF_ID(func, bpf_percpu_obj_drop_impl)
11093BTF_ID(func, bpf_throw)
11094BTF_ID(func, bpf_wq_set_callback_impl)
11095#ifdef CONFIG_CGROUPS
11096BTF_ID(func, bpf_iter_css_task_new)
11097#endif
11098BTF_SET_END(special_kfunc_set)
11099
11100BTF_ID_LIST(special_kfunc_list)
11101BTF_ID(func, bpf_obj_new_impl)
11102BTF_ID(func, bpf_obj_drop_impl)
11103BTF_ID(func, bpf_refcount_acquire_impl)
11104BTF_ID(func, bpf_list_push_front_impl)
11105BTF_ID(func, bpf_list_push_back_impl)
11106BTF_ID(func, bpf_list_pop_front)
11107BTF_ID(func, bpf_list_pop_back)
11108BTF_ID(func, bpf_cast_to_kern_ctx)
11109BTF_ID(func, bpf_rdonly_cast)
11110BTF_ID(func, bpf_rcu_read_lock)
11111BTF_ID(func, bpf_rcu_read_unlock)
11112BTF_ID(func, bpf_rbtree_remove)
11113BTF_ID(func, bpf_rbtree_add_impl)
11114BTF_ID(func, bpf_rbtree_first)
11115BTF_ID(func, bpf_dynptr_from_skb)
11116BTF_ID(func, bpf_dynptr_from_xdp)
11117BTF_ID(func, bpf_dynptr_slice)
11118BTF_ID(func, bpf_dynptr_slice_rdwr)
11119BTF_ID(func, bpf_dynptr_clone)
11120BTF_ID(func, bpf_percpu_obj_new_impl)
11121BTF_ID(func, bpf_percpu_obj_drop_impl)
11122BTF_ID(func, bpf_throw)
11123BTF_ID(func, bpf_wq_set_callback_impl)
11124BTF_ID(func, bpf_preempt_disable)
11125BTF_ID(func, bpf_preempt_enable)
11126#ifdef CONFIG_CGROUPS
11127BTF_ID(func, bpf_iter_css_task_new)
11128#else
11129BTF_ID_UNUSED
11130#endif
11131BTF_ID(func, bpf_session_cookie)
11132
11133static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
11134{
11135	if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
11136	    meta->arg_owning_ref) {
11137		return false;
11138	}
11139
11140	return meta->kfunc_flags & KF_RET_NULL;
11141}
11142
11143static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
11144{
11145	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
11146}
11147
11148static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
11149{
11150	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
11151}
11152
11153static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
11154{
11155	return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
11156}
11157
11158static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
11159{
11160	return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
11161}
11162
11163static enum kfunc_ptr_arg_type
11164get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
11165		       struct bpf_kfunc_call_arg_meta *meta,
11166		       const struct btf_type *t, const struct btf_type *ref_t,
11167		       const char *ref_tname, const struct btf_param *args,
11168		       int argno, int nargs)
11169{
11170	u32 regno = argno + 1;
11171	struct bpf_reg_state *regs = cur_regs(env);
11172	struct bpf_reg_state *reg = &regs[regno];
11173	bool arg_mem_size = false;
11174
11175	if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
11176		return KF_ARG_PTR_TO_CTX;
11177
11178	/* In this function, we verify the kfunc's BTF as per the argument type,
11179	 * leaving the rest of the verification with respect to the register
11180	 * type to our caller. When a set of conditions hold in the BTF type of
11181	 * arguments, we resolve it to a known kfunc_ptr_arg_type.
11182	 */
11183	if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
11184		return KF_ARG_PTR_TO_CTX;
11185
11186	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
11187		return KF_ARG_PTR_TO_ALLOC_BTF_ID;
11188
11189	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
11190		return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
11191
11192	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
11193		return KF_ARG_PTR_TO_DYNPTR;
11194
11195	if (is_kfunc_arg_iter(meta, argno))
11196		return KF_ARG_PTR_TO_ITER;
11197
11198	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
11199		return KF_ARG_PTR_TO_LIST_HEAD;
11200
11201	if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
11202		return KF_ARG_PTR_TO_LIST_NODE;
11203
11204	if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
11205		return KF_ARG_PTR_TO_RB_ROOT;
11206
11207	if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
11208		return KF_ARG_PTR_TO_RB_NODE;
11209
11210	if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
11211		return KF_ARG_PTR_TO_CONST_STR;
11212
11213	if (is_kfunc_arg_map(meta->btf, &args[argno]))
11214		return KF_ARG_PTR_TO_MAP;
11215
11216	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
11217		return KF_ARG_PTR_TO_WORKQUEUE;
11218
11219	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
11220		if (!btf_type_is_struct(ref_t)) {
11221			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
11222				meta->func_name, argno, btf_type_str(ref_t), ref_tname);
11223			return -EINVAL;
11224		}
11225		return KF_ARG_PTR_TO_BTF_ID;
11226	}
11227
11228	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
11229		return KF_ARG_PTR_TO_CALLBACK;
11230
11231	if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
11232		return KF_ARG_PTR_TO_NULL;
11233
11234	if (argno + 1 < nargs &&
11235	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
11236	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
11237		arg_mem_size = true;
11238
11239	/* This is the catch-all argument type for register types supported by
11240	 * check_helper_mem_access. However, we only allow it when the argument
11241	 * type is a pointer to a scalar, or to a struct composed (recursively)
11242	 * of scalars. When arg_mem_size is true, the pointer can be void *.
11243	 */
11244	if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
11245	    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
11246		verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
11247			argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
11248		return -EINVAL;
11249	}
11250	return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
11251}
11252
11253static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
11254					struct bpf_reg_state *reg,
11255					const struct btf_type *ref_t,
11256					const char *ref_tname, u32 ref_id,
11257					struct bpf_kfunc_call_arg_meta *meta,
11258					int argno)
11259{
11260	const struct btf_type *reg_ref_t;
11261	bool strict_type_match = false;
11262	const struct btf *reg_btf;
11263	const char *reg_ref_tname;
11264	u32 reg_ref_id;
11265
11266	if (base_type(reg->type) == PTR_TO_BTF_ID) {
11267		reg_btf = reg->btf;
11268		reg_ref_id = reg->btf_id;
11269	} else {
11270		reg_btf = btf_vmlinux;
11271		reg_ref_id = *reg2btf_ids[base_type(reg->type)];
11272	}
11273
11274	/* Enforce strict type matching for calls to kfuncs that are acquiring
11275	 * or releasing a reference, or are no-cast aliases. We do _not_
11276	 * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
11277	 * as we want to enable BPF programs to pass types that are bitwise
11278	 * equivalent without forcing them to explicitly cast with something
11279	 * like bpf_cast_to_kern_ctx().
11280	 *
11281	 * For example, say we had a type like the following:
11282	 *
11283	 * struct bpf_cpumask {
11284	 *	cpumask_t cpumask;
11285	 *	refcount_t usage;
11286	 * };
11287	 *
11288	 * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
11289	 * to a struct cpumask, so it would be safe to pass a struct
11290	 * bpf_cpumask * to a kfunc expecting a struct cpumask *.
11291	 *
11292	 * The philosophy here is similar to how we allow scalars of different
11293	 * types to be passed to kfuncs as long as the size is the same. The
11294	 * only difference here is that we're simply allowing
11295	 * btf_struct_ids_match() to walk the struct at the 0th offset, and
11296	 * resolve types.
11297	 */
11298	if (is_kfunc_acquire(meta) ||
11299	    (is_kfunc_release(meta) && reg->ref_obj_id) ||
11300	    btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
11301		strict_type_match = true;
11302
11303	WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
11304
11305	reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
11306	reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
11307	if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
11308		verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
11309			meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
11310			btf_type_str(reg_ref_t), reg_ref_tname);
11311		return -EINVAL;
11312	}
11313	return 0;
11314}
11315
11316static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
11317{
11318	struct bpf_verifier_state *state = env->cur_state;
11319	struct btf_record *rec = reg_btf_record(reg);
11320
11321	if (!state->active_lock.ptr) {
11322		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
11323		return -EFAULT;
11324	}
11325
11326	if (type_flag(reg->type) & NON_OWN_REF) {
11327		verbose(env, "verifier internal error: NON_OWN_REF already set\n");
11328		return -EFAULT;
11329	}
11330
11331	reg->type |= NON_OWN_REF;
11332	if (rec->refcount_off >= 0)
11333		reg->type |= MEM_RCU;
11334
11335	return 0;
11336}
11337
11338static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
11339{
11340	struct bpf_func_state *state, *unused;
11341	struct bpf_reg_state *reg;
11342	int i;
11343
11344	state = cur_func(env);
11345
11346	if (!ref_obj_id) {
11347		verbose(env, "verifier internal error: ref_obj_id is zero for "
11348			     "owning -> non-owning conversion\n");
11349		return -EFAULT;
11350	}
11351
11352	for (i = 0; i < state->acquired_refs; i++) {
11353		if (state->refs[i].id != ref_obj_id)
11354			continue;
11355
11356		/* Clear ref_obj_id here so release_reference doesn't clobber
11357		 * the whole reg
11358		 */
11359		bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
11360			if (reg->ref_obj_id == ref_obj_id) {
11361				reg->ref_obj_id = 0;
11362				ref_set_non_owning(env, reg);
11363			}
11364		}));
11365		return 0;
11366	}
11367
11368	verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
11369	return -EFAULT;
11370}
11371
11372/* Implementation details:
11373 *
11374 * Each register points to some region of memory, which we define as an
11375 * allocation. Each allocation may embed a bpf_spin_lock which protects any
11376 * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
11377 * allocation. The lock and the data it protects are colocated in the same
11378 * memory region.
11379 *
11380 * Hence, every time a register holds a pointer value pointing to such an
11381 * allocation, the verifier preserves a unique reg->id for it.
11382 *
11383 * The verifier remembers the lock 'ptr' and the lock 'id' whenever
11384 * bpf_spin_lock is called.
11385 *
11386 * To enable this, lock state in the verifier captures two values:
11387 *	active_lock.ptr = Register's type specific pointer
11388 *	active_lock.id  = A unique ID for each register pointer value
11389 *
11390 * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
11391 * supported register types.
11392 *
11393 * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
11394 * allocated objects is the reg->btf pointer.
11395 *
11396 * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
11397 * can establish the provenance of the map value statically for each distinct
11398 * lookup into such maps. They always contain a single map value, hence assigning
11399 * unique IDs to each pseudo load would pessimize the algorithm and reject valid programs.
11400 *
11401 * So, in case of global variables, they use array maps with max_entries = 1,
11402 * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
11403 * into the same map value as max_entries is 1, as described above).
11404 *
11405 * In case of inner map lookups, the inner map pointer has same map_ptr as the
11406 * outer map pointer (in verifier context), but each lookup into an inner map
11407 * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
11408 * maps from the same outer map share the same map_ptr as active_lock.ptr, they
11409 * will get different reg->id assigned to each lookup, hence different
11410 * active_lock.id.
11411 *
11412 * In case of allocated objects, active_lock.ptr is the reg->btf, and the
11413 * reg->id is a unique ID preserved after the NULL pointer check on the pointer
11414 * returned from bpf_obj_new. Each allocation receives a new reg->id.
11415 */
11416static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
11417{
11418	void *ptr;
11419	u32 id;
11420
11421	switch ((int)reg->type) {
11422	case PTR_TO_MAP_VALUE:
11423		ptr = reg->map_ptr;
11424		break;
11425	case PTR_TO_BTF_ID | MEM_ALLOC:
11426		ptr = reg->btf;
11427		break;
11428	default:
11429		verbose(env, "verifier internal error: unknown reg type for lock check\n");
11430		return -EFAULT;
11431	}
11432	id = reg->id;
11433
11434	if (!env->cur_state->active_lock.ptr)
11435		return -EINVAL;
11436	if (env->cur_state->active_lock.ptr != ptr ||
11437	    env->cur_state->active_lock.id != id) {
11438		verbose(env, "held lock and object are not in the same allocation\n");
11439		return -EINVAL;
11440	}
11441	return 0;
11442}
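
/* Illustrative sketch, not verifier code: the association enforced above,
 * using a map value that colocates the lock and the structure it protects.
 * The foo/elem names and the __contains annotation usage follow the
 * conventions of bpf_experimental.h and are hypothetical here.
 *
 *   struct foo { struct bpf_list_node node; };
 *
 *   struct elem {
 *           struct bpf_spin_lock lock;
 *           struct bpf_list_head head __contains(foo, node);
 *   };
 *
 *   bpf_spin_lock(&e->lock);
 *   bpf_list_push_back(&e->head, &f->node);     same allocation as the held lock
 *   bpf_spin_unlock(&e->lock);
 *
 * Pushing to a list head in a different map value (or a different allocated
 * object) than the one whose lock is held fails with "held lock and object
 * are not in the same allocation".
 */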
11443
11444static bool is_bpf_list_api_kfunc(u32 btf_id)
11445{
11446	return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
11447	       btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
11448	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
11449	       btf_id == special_kfunc_list[KF_bpf_list_pop_back];
11450}
11451
11452static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
11453{
11454	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
11455	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
11456	       btf_id == special_kfunc_list[KF_bpf_rbtree_first];
11457}
11458
11459static bool is_bpf_graph_api_kfunc(u32 btf_id)
11460{
11461	return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
11462	       btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
11463}
11464
11465static bool is_sync_callback_calling_kfunc(u32 btf_id)
11466{
11467	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
11468}
11469
11470static bool is_async_callback_calling_kfunc(u32 btf_id)
11471{
11472	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
11473}
11474
11475static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
11476{
11477	return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
11478	       insn->imm == special_kfunc_list[KF_bpf_throw];
11479}
11480
11481static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
11482{
11483	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
11484}
11485
11486static bool is_callback_calling_kfunc(u32 btf_id)
11487{
11488	return is_sync_callback_calling_kfunc(btf_id) ||
11489	       is_async_callback_calling_kfunc(btf_id);
11490}
11491
11492static bool is_rbtree_lock_required_kfunc(u32 btf_id)
11493{
11494	return is_bpf_rbtree_api_kfunc(btf_id);
11495}
11496
11497static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
11498					  enum btf_field_type head_field_type,
11499					  u32 kfunc_btf_id)
11500{
11501	bool ret;
11502
11503	switch (head_field_type) {
11504	case BPF_LIST_HEAD:
11505		ret = is_bpf_list_api_kfunc(kfunc_btf_id);
11506		break;
11507	case BPF_RB_ROOT:
11508		ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
11509		break;
11510	default:
11511		verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
11512			btf_field_type_name(head_field_type));
11513		return false;
11514	}
11515
11516	if (!ret)
11517		verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
11518			btf_field_type_name(head_field_type));
11519	return ret;
11520}
11521
11522static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
11523					  enum btf_field_type node_field_type,
11524					  u32 kfunc_btf_id)
11525{
11526	bool ret;
11527
11528	switch (node_field_type) {
11529	case BPF_LIST_NODE:
11530		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
11531		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
11532		break;
11533	case BPF_RB_NODE:
11534		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
11535		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
11536		break;
11537	default:
11538		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
11539			btf_field_type_name(node_field_type));
11540		return false;
11541	}
11542
11543	if (!ret)
11544		verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
11545			btf_field_type_name(node_field_type));
11546	return ret;
11547}
11548
11549static int
11550__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
11551				   struct bpf_reg_state *reg, u32 regno,
11552				   struct bpf_kfunc_call_arg_meta *meta,
11553				   enum btf_field_type head_field_type,
11554				   struct btf_field **head_field)
11555{
11556	const char *head_type_name;
11557	struct btf_field *field;
11558	struct btf_record *rec;
11559	u32 head_off;
11560
11561	if (meta->btf != btf_vmlinux) {
11562		verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
11563		return -EFAULT;
11564	}
11565
11566	if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
11567		return -EFAULT;
11568
11569	head_type_name = btf_field_type_name(head_field_type);
11570	if (!tnum_is_const(reg->var_off)) {
11571		verbose(env,
11572			"R%d doesn't have constant offset. %s has to be at the constant offset\n",
11573			regno, head_type_name);
11574		return -EINVAL;
11575	}
11576
11577	rec = reg_btf_record(reg);
11578	head_off = reg->off + reg->var_off.value;
11579	field = btf_record_find(rec, head_off, head_field_type);
11580	if (!field) {
11581		verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
11582		return -EINVAL;
11583	}
11584
11585	/* All these kfuncs require the graph root (bpf_list_head/bpf_rb_root) to be protected by a bpf_spin_lock */
11586	if (check_reg_allocation_locked(env, reg)) {
11587		verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
11588			rec->spin_lock_off, head_type_name);
11589		return -EINVAL;
11590	}
11591
11592	if (*head_field) {
11593		verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
11594		return -EFAULT;
11595	}
11596	*head_field = field;
11597	return 0;
11598}
11599
11600static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
11601					   struct bpf_reg_state *reg, u32 regno,
11602					   struct bpf_kfunc_call_arg_meta *meta)
11603{
11604	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
11605							  &meta->arg_list_head.field);
11606}
11607
11608static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
11609					     struct bpf_reg_state *reg, u32 regno,
11610					     struct bpf_kfunc_call_arg_meta *meta)
11611{
11612	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
11613							  &meta->arg_rbtree_root.field);
11614}
11615
11616static int
11617__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
11618				   struct bpf_reg_state *reg, u32 regno,
11619				   struct bpf_kfunc_call_arg_meta *meta,
11620				   enum btf_field_type head_field_type,
11621				   enum btf_field_type node_field_type,
11622				   struct btf_field **node_field)
11623{
11624	const char *node_type_name;
11625	const struct btf_type *et, *t;
11626	struct btf_field *field;
11627	u32 node_off;
11628
11629	if (meta->btf != btf_vmlinux) {
11630		verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
11631		return -EFAULT;
11632	}
11633
11634	if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
11635		return -EFAULT;
11636
11637	node_type_name = btf_field_type_name(node_field_type);
11638	if (!tnum_is_const(reg->var_off)) {
11639		verbose(env,
11640			"R%d doesn't have constant offset. %s has to be at the constant offset\n",
11641			regno, node_type_name);
11642		return -EINVAL;
11643	}
11644
11645	node_off = reg->off + reg->var_off.value;
11646	field = reg_find_field_offset(reg, node_off, node_field_type);
11647	if (!field || field->offset != node_off) {
11648		verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
11649		return -EINVAL;
11650	}
11651
11652	field = *node_field;
11653
11654	et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
11655	t = btf_type_by_id(reg->btf, reg->btf_id);
11656	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
11657				  field->graph_root.value_btf_id, true)) {
11658		verbose(env, "operation on %s expects arg#1 %s at offset=%d "
11659			"in struct %s, but arg is at offset=%d in struct %s\n",
11660			btf_field_type_name(head_field_type),
11661			btf_field_type_name(node_field_type),
11662			field->graph_root.node_offset,
11663			btf_name_by_offset(field->graph_root.btf, et->name_off),
11664			node_off, btf_name_by_offset(reg->btf, t->name_off));
11665		return -EINVAL;
11666	}
11667	meta->arg_btf = reg->btf;
11668	meta->arg_btf_id = reg->btf_id;
11669
11670	if (node_off != field->graph_root.node_offset) {
11671		verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
11672			node_off, btf_field_type_name(node_field_type),
11673			field->graph_root.node_offset,
11674			btf_name_by_offset(field->graph_root.btf, et->name_off));
11675		return -EINVAL;
11676	}
11677
11678	return 0;
11679}
11680
11681static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
11682					   struct bpf_reg_state *reg, u32 regno,
11683					   struct bpf_kfunc_call_arg_meta *meta)
11684{
11685	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
11686						  BPF_LIST_HEAD, BPF_LIST_NODE,
11687						  &meta->arg_list_head.field);
11688}
11689
11690static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
11691					     struct bpf_reg_state *reg, u32 regno,
11692					     struct bpf_kfunc_call_arg_meta *meta)
11693{
11694	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
11695						  BPF_RB_ROOT, BPF_RB_NODE,
11696						  &meta->arg_rbtree_root.field);
11697}
11698
11699/*
11700 * The css_task iter allowlist is needed to avoid deadlocking on css_set_lock.
11701 * LSM hooks and iters (both sleepable and non-sleepable) are safe.
11702 * Any sleepable progs are also safe since bpf_check_attach_target() enforces
11703 * that they can only be attached to certain hook points.
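 *
 * E.g. (illustrative): a BPF_PROG_TYPE_TRACING prog attached as
 * BPF_TRACE_ITER, an LSM prog, or any sleepable prog may use
 * bpf_iter_css_task_new(); a non-sleepable kprobe prog may not.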
11704 */
11705static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
11706{
11707	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
11708
11709	switch (prog_type) {
11710	case BPF_PROG_TYPE_LSM:
11711		return true;
11712	case BPF_PROG_TYPE_TRACING:
11713		if (env->prog->expected_attach_type == BPF_TRACE_ITER)
11714			return true;
11715		fallthrough;
11716	default:
11717		return in_sleepable(env);
11718	}
11719}
11720
11721static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
11722			    int insn_idx)
11723{
11724	const char *func_name = meta->func_name, *ref_tname;
11725	const struct btf *btf = meta->btf;
11726	const struct btf_param *args;
11727	struct btf_record *rec;
11728	u32 i, nargs;
11729	int ret;
11730
11731	args = (const struct btf_param *)(meta->func_proto + 1);
11732	nargs = btf_type_vlen(meta->func_proto);
11733	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
11734		verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
11735			MAX_BPF_FUNC_REG_ARGS);
11736		return -EINVAL;
11737	}
11738
11739	/* Check that BTF function arguments match actual types that the
11740	 * verifier sees.
11741	 */
11742	for (i = 0; i < nargs; i++) {
11743		struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
11744		const struct btf_type *t, *ref_t, *resolve_ret;
11745		enum bpf_arg_type arg_type = ARG_DONTCARE;
11746		u32 regno = i + 1, ref_id, type_size;
11747		bool is_ret_buf_sz = false;
11748		int kf_arg_type;
11749
11750		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
11751
11752		if (is_kfunc_arg_ignore(btf, &args[i]))
11753			continue;
11754
11755		if (btf_type_is_scalar(t)) {
11756			if (reg->type != SCALAR_VALUE) {
11757				verbose(env, "R%d is not a scalar\n", regno);
11758				return -EINVAL;
11759			}
11760
11761			if (is_kfunc_arg_constant(meta->btf, &args[i])) {
11762				if (meta->arg_constant.found) {
11763					verbose(env, "verifier internal error: only one constant argument permitted\n");
11764					return -EFAULT;
11765				}
11766				if (!tnum_is_const(reg->var_off)) {
11767					verbose(env, "R%d must be a known constant\n", regno);
11768					return -EINVAL;
11769				}
11770				ret = mark_chain_precision(env, regno);
11771				if (ret < 0)
11772					return ret;
11773				meta->arg_constant.found = true;
11774				meta->arg_constant.value = reg->var_off.value;
11775			} else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
11776				meta->r0_rdonly = true;
11777				is_ret_buf_sz = true;
11778			} else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
11779				is_ret_buf_sz = true;
11780			}
11781
11782			if (is_ret_buf_sz) {
11783				if (meta->r0_size) {
11784					verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
11785					return -EINVAL;
11786				}
11787
11788				if (!tnum_is_const(reg->var_off)) {
11789					verbose(env, "R%d is not a const\n", regno);
11790					return -EINVAL;
11791				}
11792
11793				meta->r0_size = reg->var_off.value;
11794				ret = mark_chain_precision(env, regno);
11795				if (ret)
11796					return ret;
11797			}
11798			continue;
11799		}
11800
11801		if (!btf_type_is_ptr(t)) {
11802			verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
11803			return -EINVAL;
11804		}
11805
11806		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
11807		    (register_is_null(reg) || type_may_be_null(reg->type)) &&
11808			!is_kfunc_arg_nullable(meta->btf, &args[i])) {
11809			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
11810			return -EACCES;
11811		}
11812
11813		if (reg->ref_obj_id) {
11814			if (is_kfunc_release(meta) && meta->ref_obj_id) {
11815				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
11816					regno, reg->ref_obj_id,
11817					meta->ref_obj_id);
11818				return -EFAULT;
11819			}
11820			meta->ref_obj_id = reg->ref_obj_id;
11821			if (is_kfunc_release(meta))
11822				meta->release_regno = regno;
11823		}
11824
11825		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
11826		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
11827
11828		kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
11829		if (kf_arg_type < 0)
11830			return kf_arg_type;
11831
11832		switch (kf_arg_type) {
11833		case KF_ARG_PTR_TO_NULL:
11834			continue;
11835		case KF_ARG_PTR_TO_MAP:
11836			if (!reg->map_ptr) {
11837				verbose(env, "pointer in R%d isn't map pointer\n", regno);
11838				return -EINVAL;
11839			}
11840			if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
11841				/* Use map_uid (which is unique id of inner map) to reject:
11842				 * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
11843				 * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
11844				 * if (inner_map1 && inner_map2) {
11845				 *     wq = bpf_map_lookup_elem(inner_map1);
11846				 *     if (wq)
11847				 *         // mismatch would have been allowed
11848				 *         bpf_wq_init(wq, inner_map2);
11849				 * }
11850				 *
11851				 * Comparing map_ptr is enough to distinguish normal and outer maps.
11852				 */
11853				if (meta->map.ptr != reg->map_ptr ||
11854				    meta->map.uid != reg->map_uid) {
11855					verbose(env,
11856						"workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
11857						meta->map.uid, reg->map_uid);
11858					return -EINVAL;
11859				}
11860			}
11861			meta->map.ptr = reg->map_ptr;
11862			meta->map.uid = reg->map_uid;
11863			fallthrough;
11864		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
11865		case KF_ARG_PTR_TO_BTF_ID:
11866			if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
11867				break;
11868
11869			if (!is_trusted_reg(reg)) {
11870				if (!is_kfunc_rcu(meta)) {
11871					verbose(env, "R%d must be referenced or trusted\n", regno);
11872					return -EINVAL;
11873				}
11874				if (!is_rcu_reg(reg)) {
11875					verbose(env, "R%d must be a rcu pointer\n", regno);
11876					return -EINVAL;
11877				}
11878			}
11879
11880			fallthrough;
11881		case KF_ARG_PTR_TO_CTX:
11882			/* Trusted arguments have the same offset checks as release arguments */
11883			arg_type |= OBJ_RELEASE;
11884			break;
11885		case KF_ARG_PTR_TO_DYNPTR:
11886		case KF_ARG_PTR_TO_ITER:
11887		case KF_ARG_PTR_TO_LIST_HEAD:
11888		case KF_ARG_PTR_TO_LIST_NODE:
11889		case KF_ARG_PTR_TO_RB_ROOT:
11890		case KF_ARG_PTR_TO_RB_NODE:
11891		case KF_ARG_PTR_TO_MEM:
11892		case KF_ARG_PTR_TO_MEM_SIZE:
11893		case KF_ARG_PTR_TO_CALLBACK:
11894		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
11895		case KF_ARG_PTR_TO_CONST_STR:
11896		case KF_ARG_PTR_TO_WORKQUEUE:
11897			/* Trusted by default */
11898			break;
11899		default:
11900			WARN_ON_ONCE(1);
11901			return -EFAULT;
11902		}
11903
11904		if (is_kfunc_release(meta) && reg->ref_obj_id)
11905			arg_type |= OBJ_RELEASE;
11906		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
11907		if (ret < 0)
11908			return ret;
11909
11910		switch (kf_arg_type) {
11911		case KF_ARG_PTR_TO_CTX:
11912			if (reg->type != PTR_TO_CTX) {
11913				verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
11914				return -EINVAL;
11915			}
11916
11917			if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
11918				ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
11919				if (ret < 0)
11920					return -EINVAL;
11921				meta->ret_btf_id  = ret;
11922			}
11923			break;
11924		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
11925			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
11926				if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
11927					verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
11928					return -EINVAL;
11929				}
11930			} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
11931				if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
11932					verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
11933					return -EINVAL;
11934				}
11935			} else {
11936				verbose(env, "arg#%d expected pointer to allocated object\n", i);
11937				return -EINVAL;
11938			}
11939			if (!reg->ref_obj_id) {
11940				verbose(env, "allocated object must be referenced\n");
11941				return -EINVAL;
11942			}
11943			if (meta->btf == btf_vmlinux) {
11944				meta->arg_btf = reg->btf;
11945				meta->arg_btf_id = reg->btf_id;
11946			}
11947			break;
11948		case KF_ARG_PTR_TO_DYNPTR:
11949		{
11950			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
11951			int clone_ref_obj_id = 0;
11952
11953			if (reg->type != PTR_TO_STACK &&
11954			    reg->type != CONST_PTR_TO_DYNPTR) {
11955				verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
11956				return -EINVAL;
11957			}
11958
11959			if (reg->type == CONST_PTR_TO_DYNPTR)
11960				dynptr_arg_type |= MEM_RDONLY;
11961
11962			if (is_kfunc_arg_uninit(btf, &args[i]))
11963				dynptr_arg_type |= MEM_UNINIT;
11964
11965			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
11966				dynptr_arg_type |= DYNPTR_TYPE_SKB;
11967			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
11968				dynptr_arg_type |= DYNPTR_TYPE_XDP;
11969			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
11970				   (dynptr_arg_type & MEM_UNINIT)) {
11971				enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
11972
11973				if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
11974					verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
11975					return -EFAULT;
11976				}
11977
11978				dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
11979				clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
11980				if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
11981					verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
11982					return -EFAULT;
11983				}
11984			}
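			/* Illustrative sketch (map/variable names hypothetical):
			 *	bpf_ringbuf_reserve_dynptr(&rb, 64, 0, &ptr);
			 *	bpf_dynptr_clone(&ptr, &clone);
			 * The clone inherits the parent's dynptr type and, for
			 * refcounted types, its ref_obj_id, so releasing the
			 * parent (e.g. ringbuf discard/submit) also invalidates
			 * the clone.
			 */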
11985
11986			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
11987			if (ret < 0)
11988				return ret;
11989
11990			if (!(dynptr_arg_type & MEM_UNINIT)) {
11991				int id = dynptr_id(env, reg);
11992
11993				if (id < 0) {
11994					verbose(env, "verifier internal error: failed to obtain dynptr id\n");
11995					return id;
11996				}
11997				meta->initialized_dynptr.id = id;
11998				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
11999				meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
12000			}
12001
12002			break;
12003		}
12004		case KF_ARG_PTR_TO_ITER:
12005			if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
12006				if (!check_css_task_iter_allowlist(env)) {
12007					verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
12008					return -EINVAL;
12009				}
12010			}
12011			ret = process_iter_arg(env, regno, insn_idx, meta);
12012			if (ret < 0)
12013				return ret;
12014			break;
12015		case KF_ARG_PTR_TO_LIST_HEAD:
12016			if (reg->type != PTR_TO_MAP_VALUE &&
12017			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
12018				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
12019				return -EINVAL;
12020			}
12021			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
12022				verbose(env, "allocated object must be referenced\n");
12023				return -EINVAL;
12024			}
12025			ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
12026			if (ret < 0)
12027				return ret;
12028			break;
12029		case KF_ARG_PTR_TO_RB_ROOT:
12030			if (reg->type != PTR_TO_MAP_VALUE &&
12031			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
12032				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
12033				return -EINVAL;
12034			}
12035			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
12036				verbose(env, "allocated object must be referenced\n");
12037				return -EINVAL;
12038			}
12039			ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
12040			if (ret < 0)
12041				return ret;
12042			break;
12043		case KF_ARG_PTR_TO_LIST_NODE:
12044			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
12045				verbose(env, "arg#%d expected pointer to allocated object\n", i);
12046				return -EINVAL;
12047			}
12048			if (!reg->ref_obj_id) {
12049				verbose(env, "allocated object must be referenced\n");
12050				return -EINVAL;
12051			}
12052			ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
12053			if (ret < 0)
12054				return ret;
12055			break;
12056		case KF_ARG_PTR_TO_RB_NODE:
12057			if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
12058				if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
12059					verbose(env, "rbtree_remove node input must be non-owning ref\n");
12060					return -EINVAL;
12061				}
12062				if (in_rbtree_lock_required_cb(env)) {
12063					verbose(env, "rbtree_remove not allowed in rbtree cb\n");
12064					return -EINVAL;
12065				}
12066			} else {
12067				if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
12068					verbose(env, "arg#%d expected pointer to allocated object\n", i);
12069					return -EINVAL;
12070				}
12071				if (!reg->ref_obj_id) {
12072					verbose(env, "allocated object must be referenced\n");
12073					return -EINVAL;
12074				}
12075			}
12076
12077			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
12078			if (ret < 0)
12079				return ret;
12080			break;
12081		case KF_ARG_PTR_TO_MAP:
12082			/* If argument has '__map' suffix expect 'struct bpf_map *' */
12083			ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
12084			ref_t = btf_type_by_id(btf_vmlinux, ref_id);
12085			ref_tname = btf_name_by_offset(btf, ref_t->name_off);
12086			fallthrough;
12087		case KF_ARG_PTR_TO_BTF_ID:
12088			/* Only base_type is checked, further checks are done here */
12089			if ((base_type(reg->type) != PTR_TO_BTF_ID ||
12090			     (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
12091			    !reg2btf_ids[base_type(reg->type)]) {
12092				verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
12093				verbose(env, "expected %s or socket\n",
12094					reg_type_str(env, base_type(reg->type) |
12095							  (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
12096				return -EINVAL;
12097			}
12098			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
12099			if (ret < 0)
12100				return ret;
12101			break;
12102		case KF_ARG_PTR_TO_MEM:
12103			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
12104			if (IS_ERR(resolve_ret)) {
12105				verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
12106					i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
12107				return -EINVAL;
12108			}
12109			ret = check_mem_reg(env, reg, regno, type_size);
12110			if (ret < 0)
12111				return ret;
12112			break;
12113		case KF_ARG_PTR_TO_MEM_SIZE:
12114		{
12115			struct bpf_reg_state *buff_reg = &regs[regno];
12116			const struct btf_param *buff_arg = &args[i];
12117			struct bpf_reg_state *size_reg = &regs[regno + 1];
12118			const struct btf_param *size_arg = &args[i + 1];
12119
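			/* Illustrative shape of this mem/size pairing (kfunc
			 * and parameter names below are hypothetical):
			 *	bpf_foo(void *mem, u32 mem__sz)
			 * The pointer register is validated against the size
			 * held in the following register.
			 */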
12120			if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
12121				ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
12122				if (ret < 0) {
12123					verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
12124					return ret;
12125				}
12126			}
12127
12128			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
12129				if (meta->arg_constant.found) {
12130					verbose(env, "verifier internal error: only one constant argument permitted\n");
12131					return -EFAULT;
12132				}
12133				if (!tnum_is_const(size_reg->var_off)) {
12134					verbose(env, "R%d must be a known constant\n", regno + 1);
12135					return -EINVAL;
12136				}
12137				meta->arg_constant.found = true;
12138				meta->arg_constant.value = size_reg->var_off.value;
12139			}
12140
12141			/* Skip next '__sz' or '__szk' argument */
12142			i++;
12143			break;
12144		}
12145		case KF_ARG_PTR_TO_CALLBACK:
12146			if (reg->type != PTR_TO_FUNC) {
12147				verbose(env, "arg%d expected pointer to func\n", i);
12148				return -EINVAL;
12149			}
12150			meta->subprogno = reg->subprogno;
12151			break;
12152		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
12153			if (!type_is_ptr_alloc_obj(reg->type)) {
12154				verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
12155				return -EINVAL;
12156			}
12157			if (!type_is_non_owning_ref(reg->type))
12158				meta->arg_owning_ref = true;
12159
12160			rec = reg_btf_record(reg);
12161			if (!rec) {
12162				verbose(env, "verifier internal error: Couldn't find btf_record\n");
12163				return -EFAULT;
12164			}
12165
12166			if (rec->refcount_off < 0) {
12167				verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
12168				return -EINVAL;
12169			}
12170
12171			meta->arg_btf = reg->btf;
12172			meta->arg_btf_id = reg->btf_id;
12173			break;
12174		case KF_ARG_PTR_TO_CONST_STR:
12175			if (reg->type != PTR_TO_MAP_VALUE) {
12176				verbose(env, "arg#%d doesn't point to a const string\n", i);
12177				return -EINVAL;
12178			}
12179			ret = check_reg_const_str(env, reg, regno);
12180			if (ret)
12181				return ret;
12182			break;
12183		case KF_ARG_PTR_TO_WORKQUEUE:
12184			if (reg->type != PTR_TO_MAP_VALUE) {
12185				verbose(env, "arg#%d doesn't point to a map value\n", i);
12186				return -EINVAL;
12187			}
12188			ret = process_wq_func(env, regno, meta);
12189			if (ret < 0)
12190				return ret;
12191			break;
12192		}
12193	}
12194
12195	if (is_kfunc_release(meta) && !meta->release_regno) {
12196		verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
12197			func_name);
12198		return -EINVAL;
12199	}
12200
12201	return 0;
12202}
12203
12204static int fetch_kfunc_meta(struct bpf_verifier_env *env,
12205			    struct bpf_insn *insn,
12206			    struct bpf_kfunc_call_arg_meta *meta,
12207			    const char **kfunc_name)
12208{
12209	const struct btf_type *func, *func_proto;
12210	u32 func_id, *kfunc_flags;
12211	const char *func_name;
12212	struct btf *desc_btf;
12213
12214	if (kfunc_name)
12215		*kfunc_name = NULL;
12216
12217	if (!insn->imm)
12218		return -EINVAL;
12219
12220	desc_btf = find_kfunc_desc_btf(env, insn->off);
12221	if (IS_ERR(desc_btf))
12222		return PTR_ERR(desc_btf);
12223
12224	func_id = insn->imm;
12225	func = btf_type_by_id(desc_btf, func_id);
12226	func_name = btf_name_by_offset(desc_btf, func->name_off);
12227	if (kfunc_name)
12228		*kfunc_name = func_name;
12229	func_proto = btf_type_by_id(desc_btf, func->type);
12230
12231	kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
12232	if (!kfunc_flags)
12233		return -EACCES;
12235
12236	memset(meta, 0, sizeof(*meta));
12237	meta->btf = desc_btf;
12238	meta->func_id = func_id;
12239	meta->kfunc_flags = *kfunc_flags;
12240	meta->func_proto = func_proto;
12241	meta->func_name = func_name;
12242
12243	return 0;
12244}
12245
12246static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
12247
12248static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
12249			    int *insn_idx_p)
12250{
12251	bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
12252	u32 i, nargs, ptr_type_id, release_ref_obj_id;
12253	struct bpf_reg_state *regs = cur_regs(env);
12254	const char *func_name, *ptr_type_name;
12255	const struct btf_type *t, *ptr_type;
12256	struct bpf_kfunc_call_arg_meta meta;
12257	struct bpf_insn_aux_data *insn_aux;
12258	int err, insn_idx = *insn_idx_p;
12259	const struct btf_param *args;
12260	const struct btf_type *ret_t;
12261	struct btf *desc_btf;
12262
12263	/* skip for now, but return error when we find this in fixup_kfunc_call */
12264	if (!insn->imm)
12265		return 0;
12266
12267	err = fetch_kfunc_meta(env, insn, &meta, &func_name);
12268	if (err == -EACCES && func_name)
12269		verbose(env, "calling kernel function %s is not allowed\n", func_name);
12270	if (err)
12271		return err;
12272	desc_btf = meta.btf;
12273	insn_aux = &env->insn_aux_data[insn_idx];
12274
12275	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
12276
12277	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
12278		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
12279		return -EACCES;
12280	}
12281
12282	sleepable = is_kfunc_sleepable(&meta);
12283	if (sleepable && !in_sleepable(env)) {
12284		verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
12285		return -EACCES;
12286	}
12287
12288	/* Check the arguments */
12289	err = check_kfunc_args(env, &meta, insn_idx);
12290	if (err < 0)
12291		return err;
12292
12293	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
12294		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
12295					 set_rbtree_add_callback_state);
12296		if (err) {
12297			verbose(env, "kfunc %s#%d failed callback verification\n",
12298				func_name, meta.func_id);
12299			return err;
12300		}
12301	}
12302
12303	if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
12304		meta.r0_size = sizeof(u64);
12305		meta.r0_rdonly = false;
12306	}
12307
12308	if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
12309		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
12310					 set_timer_callback_state);
12311		if (err) {
12312			verbose(env, "kfunc %s#%d failed callback verification\n",
12313				func_name, meta.func_id);
12314			return err;
12315		}
12316	}
12317
12318	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
12319	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
12320
12321	preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
12322	preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
12323
12324	if (env->cur_state->active_rcu_lock) {
12325		struct bpf_func_state *state;
12326		struct bpf_reg_state *reg;
12327		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
12328
12329		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
12330			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
12331			return -EACCES;
12332		}
12333
12334		if (rcu_lock) {
12335			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
12336			return -EINVAL;
12337		} else if (rcu_unlock) {
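			/* Walk spilled registers and iterator states too
			 * (clear_mask), so RCU pointers saved on the stack are
			 * also downgraded to PTR_UNTRUSTED on unlock.
			 */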
12338			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
12339				if (reg->type & MEM_RCU) {
12340					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
12341					reg->type |= PTR_UNTRUSTED;
12342				}
12343			}));
12344			env->cur_state->active_rcu_lock = false;
12345		} else if (sleepable) {
12346			verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
12347			return -EACCES;
12348		}
12349	} else if (rcu_lock) {
12350		env->cur_state->active_rcu_lock = true;
12351	} else if (rcu_unlock) {
12352		verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
12353		return -EINVAL;
12354	}
12355
12356	if (env->cur_state->active_preempt_lock) {
12357		if (preempt_disable) {
12358			env->cur_state->active_preempt_lock++;
12359		} else if (preempt_enable) {
12360			env->cur_state->active_preempt_lock--;
12361		} else if (sleepable) {
12362			verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
12363			return -EACCES;
12364		}
12365	} else if (preempt_disable) {
12366		env->cur_state->active_preempt_lock++;
12367	} else if (preempt_enable) {
12368		verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
12369		return -EINVAL;
12370	}
12371
12372	/* In case of a release kfunc, we get the register number of the refcounted
12373	 * PTR_TO_BTF_ID in bpf_kfunc_call_arg_meta; do the release now.
12374	 */
12375	if (meta.release_regno) {
12376		err = release_reference(env, regs[meta.release_regno].ref_obj_id);
12377		if (err) {
12378			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
12379				func_name, meta.func_id);
12380			return err;
12381		}
12382	}
12383
12384	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
12385	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
12386	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
12387		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
12388		insn_aux->insert_off = regs[BPF_REG_2].off;
12389		insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
12390		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
12391		if (err) {
12392			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
12393				func_name, meta.func_id);
12394			return err;
12395		}
12396
12397		err = release_reference(env, release_ref_obj_id);
12398		if (err) {
12399			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
12400				func_name, meta.func_id);
12401			return err;
12402		}
12403	}
12404
12405	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
12406		if (!bpf_jit_supports_exceptions()) {
12407			verbose(env, "JIT does not support calling kfunc %s#%d\n",
12408				func_name, meta.func_id);
12409			return -ENOTSUPP;
12410		}
12411		env->seen_exception = true;
12412
12413		/* In the case of the default exception callback, the cookie value
12414		 * passed to bpf_throw becomes the return value of the program.
12415		 */
12416		if (!env->exception_callback_subprog) {
12417			err = check_return_code(env, BPF_REG_1, "R1");
12418			if (err < 0)
12419				return err;
12420		}
12421	}
12422
12423	for (i = 0; i < CALLER_SAVED_REGS; i++)
12424		mark_reg_not_init(env, regs, caller_saved[i]);
12425
12426	/* Check return type */
12427	t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
12428
12429	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
12430		/* Only exception is bpf_obj_new_impl */
12431		if (meta.btf != btf_vmlinux ||
12432		    (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
12433		     meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
12434		     meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
12435			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
12436			return -EINVAL;
12437		}
12438	}
12439
12440	if (btf_type_is_scalar(t)) {
12441		mark_reg_unknown(env, regs, BPF_REG_0);
12442		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
12443	} else if (btf_type_is_ptr(t)) {
12444		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
12445
12446		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
12447			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
12448			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
12449				struct btf_struct_meta *struct_meta;
12450				struct btf *ret_btf;
12451				u32 ret_btf_id;
12452
12453				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
12454					return -ENOMEM;
12455
12456				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
12457					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
12458					return -EINVAL;
12459				}
12460
12461				ret_btf = env->prog->aux->btf;
12462				ret_btf_id = meta.arg_constant.value;
12463
12464				/* This may be NULL due to user not supplying a BTF */
12465				if (!ret_btf) {
12466					verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
12467					return -EINVAL;
12468				}
12469
12470				ret_t = btf_type_by_id(ret_btf, ret_btf_id);
12471				if (!ret_t || !__btf_type_is_struct(ret_t)) {
12472					verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
12473					return -EINVAL;
12474				}
12475
12476				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
12477					if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
12478						verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
12479							ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
12480						return -EINVAL;
12481					}
12482
12483					if (!bpf_global_percpu_ma_set) {
12484						mutex_lock(&bpf_percpu_ma_lock);
12485						if (!bpf_global_percpu_ma_set) {
12486							/* Charge memory allocated with bpf_global_percpu_ma to
12487							 * root memcg. The obj_cgroup for root memcg is NULL.
12488							 */
12489							err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
12490							if (!err)
12491								bpf_global_percpu_ma_set = true;
12492						}
12493						mutex_unlock(&bpf_percpu_ma_lock);
12494						if (err)
12495							return err;
12496					}
12497
12498					mutex_lock(&bpf_percpu_ma_lock);
12499					err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
12500					mutex_unlock(&bpf_percpu_ma_lock);
12501					if (err)
12502						return err;
12503				}
12504
12505				struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
12506				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
12507					if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
12508						verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
12509						return -EINVAL;
12510					}
12511
12512					if (struct_meta) {
12513						verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
12514						return -EINVAL;
12515					}
12516				}
12517
12518				mark_reg_known_zero(env, regs, BPF_REG_0);
12519				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
12520				regs[BPF_REG_0].btf = ret_btf;
12521				regs[BPF_REG_0].btf_id = ret_btf_id;
12522				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
12523					regs[BPF_REG_0].type |= MEM_PERCPU;
12524
12525				insn_aux->obj_new_size = ret_t->size;
12526				insn_aux->kptr_struct_meta = struct_meta;
12527			} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
12528				mark_reg_known_zero(env, regs, BPF_REG_0);
12529				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
12530				regs[BPF_REG_0].btf = meta.arg_btf;
12531				regs[BPF_REG_0].btf_id = meta.arg_btf_id;
12532
12533				insn_aux->kptr_struct_meta =
12534					btf_find_struct_meta(meta.arg_btf,
12535							     meta.arg_btf_id);
12536			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
12537				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
12538				struct btf_field *field = meta.arg_list_head.field;
12539
12540				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
12541			} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
12542				   meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
12543				struct btf_field *field = meta.arg_rbtree_root.field;
12544
12545				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
12546			} else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
12547				mark_reg_known_zero(env, regs, BPF_REG_0);
12548				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
12549				regs[BPF_REG_0].btf = desc_btf;
12550				regs[BPF_REG_0].btf_id = meta.ret_btf_id;
12551			} else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
12552				ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
12553				if (!ret_t || !btf_type_is_struct(ret_t)) {
12554					verbose(env,
12555						"kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
12556					return -EINVAL;
12557				}
12558
12559				mark_reg_known_zero(env, regs, BPF_REG_0);
12560				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
12561				regs[BPF_REG_0].btf = desc_btf;
12562				regs[BPF_REG_0].btf_id = meta.arg_constant.value;
12563			} else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
12564				   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
12565				enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
12566
12567				mark_reg_known_zero(env, regs, BPF_REG_0);
12568
12569				if (!meta.arg_constant.found) {
12570					verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
12571					return -EFAULT;
12572				}
12573
12574				regs[BPF_REG_0].mem_size = meta.arg_constant.value;
12575
12576				/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
12577				regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
12578
12579				if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
12580					regs[BPF_REG_0].type |= MEM_RDONLY;
12581				} else {
12582					/* this will set env->seen_direct_write to true */
12583					if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
12584						verbose(env, "the prog does not allow writes to packet data\n");
12585						return -EINVAL;
12586					}
12587				}
12588
12589				if (!meta.initialized_dynptr.id) {
12590					verbose(env, "verifier internal error: no dynptr id\n");
12591					return -EFAULT;
12592				}
12593				regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
12594
12595				/* we don't need to set BPF_REG_0's ref obj id
12596				 * because packet slices are not refcounted (see
12597				 * dynptr_type_refcounted)
12598				 */
12599			} else {
12600				verbose(env, "kernel function %s unhandled dynamic return type\n",
12601					meta.func_name);
12602				return -EFAULT;
12603			}
12604		} else if (btf_type_is_void(ptr_type)) {
12605			/* kfunc returning 'void *' is equivalent to returning scalar */
12606			mark_reg_unknown(env, regs, BPF_REG_0);
12607		} else if (!__btf_type_is_struct(ptr_type)) {
12608			if (!meta.r0_size) {
12609				__u32 sz;
12610
12611				if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
12612					meta.r0_size = sz;
12613					meta.r0_rdonly = true;
12614				}
12615			}
12616			if (!meta.r0_size) {
12617				ptr_type_name = btf_name_by_offset(desc_btf,
12618								   ptr_type->name_off);
12619				verbose(env,
12620					"kernel function %s returns pointer type %s %s is not supported\n",
12621					func_name,
12622					btf_type_str(ptr_type),
12623					ptr_type_name);
12624				return -EINVAL;
12625			}
12626
12627			mark_reg_known_zero(env, regs, BPF_REG_0);
12628			regs[BPF_REG_0].type = PTR_TO_MEM;
12629			regs[BPF_REG_0].mem_size = meta.r0_size;
12630
12631			if (meta.r0_rdonly)
12632				regs[BPF_REG_0].type |= MEM_RDONLY;
12633
12634			/* Ensures we don't access the memory after a release_reference() */
12635			if (meta.ref_obj_id)
12636				regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
12637		} else {
12638			mark_reg_known_zero(env, regs, BPF_REG_0);
12639			regs[BPF_REG_0].btf = desc_btf;
12640			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
12641			regs[BPF_REG_0].btf_id = ptr_type_id;
12642		}
12643
12644		if (is_kfunc_ret_null(&meta)) {
12645			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
12646			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
12647			regs[BPF_REG_0].id = ++env->id_gen;
12648		}
12649		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
12650		if (is_kfunc_acquire(&meta)) {
12651			int id = acquire_reference_state(env, insn_idx);
12652
12653			if (id < 0)
12654				return id;
12655			if (is_kfunc_ret_null(&meta))
12656				regs[BPF_REG_0].id = id;
12657			regs[BPF_REG_0].ref_obj_id = id;
12658		} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
12659			ref_set_non_owning(env, &regs[BPF_REG_0]);
12660		}
12661
12662		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
12663			regs[BPF_REG_0].id = ++env->id_gen;
12664	} else if (btf_type_is_void(t)) {
12665		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
12666			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
12667			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
12668				insn_aux->kptr_struct_meta =
12669					btf_find_struct_meta(meta.arg_btf,
12670							     meta.arg_btf_id);
12671			}
12672		}
12673	}
12674
12675	nargs = btf_type_vlen(meta.func_proto);
12676	args = (const struct btf_param *)(meta.func_proto + 1);
12677	for (i = 0; i < nargs; i++) {
12678		u32 regno = i + 1;
12679
12680		t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
12681		if (btf_type_is_ptr(t))
12682			mark_btf_func_reg_size(env, regno, sizeof(void *));
12683		else
12684			/* scalar. ensured by check_kfunc_args() */
12685			mark_btf_func_reg_size(env, regno, t->size);
12686	}
12687
12688	if (is_iter_next_kfunc(&meta)) {
12689		err = process_iter_next_call(env, insn_idx, &meta);
12690		if (err)
12691			return err;
12692	}
12693
12694	return 0;
12695}
12696
12697static bool signed_add_overflows(s64 a, s64 b)
12698{
12699	/* Do the add in u64, where overflow is well-defined */
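	/* Worked example (illustrative): a = 1, b = S64_MAX; the u64 add wraps
	 * to S64_MIN, so res < a and overflow is reported. For b < 0 the test
	 * flips: overflow iff res > a.
	 */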
12700	s64 res = (s64)((u64)a + (u64)b);
12701
12702	if (b < 0)
12703		return res > a;
12704	return res < a;
12705}
12706
12707static bool signed_add32_overflows(s32 a, s32 b)
12708{
12709	/* Do the add in u32, where overflow is well-defined */
12710	s32 res = (s32)((u32)a + (u32)b);
12711
12712	if (b < 0)
12713		return res > a;
12714	return res < a;
12715}
12716
12717static bool signed_sub_overflows(s64 a, s64 b)
12718{
12719	/* Do the sub in u64, where overflow is well-defined */
12720	s64 res = (s64)((u64)a - (u64)b);
12721
12722	if (b < 0)
12723		return res < a;
12724	return res > a;
12725}
12726
12727static bool signed_sub32_overflows(s32 a, s32 b)
12728{
12729	/* Do the sub in u32, where overflow is well-defined */
12730	s32 res = (s32)((u32)a - (u32)b);
12731
12732	if (b < 0)
12733		return res < a;
12734	return res > a;
12735}
12736
12737static bool check_reg_sane_offset(struct bpf_verifier_env *env,
12738				  const struct bpf_reg_state *reg,
12739				  enum bpf_reg_type type)
12740{
12741	bool known = tnum_is_const(reg->var_off);
12742	s64 val = reg->var_off.value;
12743	s64 smin = reg->smin_value;
12744
12745	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
12746		verbose(env, "math between %s pointer and %lld is not allowed\n",
12747			reg_type_str(env, type), val);
12748		return false;
12749	}
12750
12751	if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
12752		verbose(env, "%s pointer offset %d is not allowed\n",
12753			reg_type_str(env, type), reg->off);
12754		return false;
12755	}
12756
12757	if (smin == S64_MIN) {
12758		verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
12759			reg_type_str(env, type));
12760		return false;
12761	}
12762
12763	if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
12764		verbose(env, "value %lld makes %s pointer be out of bounds\n",
12765			smin, reg_type_str(env, type));
12766		return false;
12767	}
12768
12769	return true;
12770}
12771
12772enum {
12773	REASON_BOUNDS	= -1,
12774	REASON_TYPE	= -2,
12775	REASON_PATHS	= -3,
12776	REASON_LIMIT	= -4,
12777	REASON_STACK	= -5,
12778};
12779
12780static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
12781			      u32 *alu_limit, bool mask_to_left)
12782{
12783	u32 max = 0, ptr_limit = 0;
12784
12785	switch (ptr_reg->type) {
12786	case PTR_TO_STACK:
12787		/* Offset 0 is out-of-bounds, but acceptable start for the
12788		 * left direction, see BPF_REG_FP. Also, unknown scalar
12789		 * offset where we would need to deal with min/max bounds is
12790		 * currently prohibited for unprivileged.
12791		 */
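		/* Worked example (illustrative): a PTR_TO_STACK reg at fp-16
		 * (off == -16, const var_off == 0) yields ptr_limit == 16,
		 * i.e. its distance below BPF_REG_FP; max is MAX_BPF_STACK
		 * (+1 when masking to the left). sanitize_ptr_alu() later
		 * narrows alu_limit to the absolute difference between the
		 * limits computed before and after the simulated pointer move.
		 */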
12792		max = MAX_BPF_STACK + mask_to_left;
12793		ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
12794		break;
12795	case PTR_TO_MAP_VALUE:
12796		max = ptr_reg->map_ptr->value_size;
12797		ptr_limit = (mask_to_left ?
12798			     ptr_reg->smin_value :
12799			     ptr_reg->umax_value) + ptr_reg->off;
12800		break;
12801	default:
12802		return REASON_TYPE;
12803	}
12804
12805	if (ptr_limit >= max)
12806		return REASON_LIMIT;
12807	*alu_limit = ptr_limit;
12808	return 0;
12809}
12810
12811static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
12812				    const struct bpf_insn *insn)
12813{
12814	return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
12815}
12816
12817static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
12818				       u32 alu_state, u32 alu_limit)
12819{
12820	/* If we arrived here from different branches with different
12821	 * state or limits to sanitize, then this won't work.
12822	 */
12823	if (aux->alu_state &&
12824	    (aux->alu_state != alu_state ||
12825	     aux->alu_limit != alu_limit))
12826		return REASON_PATHS;
12827
12828	/* Corresponding fixup done in do_misc_fixups(). */
12829	aux->alu_state = alu_state;
12830	aux->alu_limit = alu_limit;
12831	return 0;
12832}
12833
12834static int sanitize_val_alu(struct bpf_verifier_env *env,
12835			    struct bpf_insn *insn)
12836{
12837	struct bpf_insn_aux_data *aux = cur_aux(env);
12838
12839	if (can_skip_alu_sanitation(env, insn))
12840		return 0;
12841
12842	return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
12843}
12844
12845static bool sanitize_needed(u8 opcode)
12846{
12847	return opcode == BPF_ADD || opcode == BPF_SUB;
12848}
12849
12850struct bpf_sanitize_info {
12851	struct bpf_insn_aux_data aux;
12852	bool mask_to_left;
12853};
12854
12855static struct bpf_verifier_state *
12856sanitize_speculative_path(struct bpf_verifier_env *env,
12857			  const struct bpf_insn *insn,
12858			  u32 next_idx, u32 curr_idx)
12859{
12860	struct bpf_verifier_state *branch;
12861	struct bpf_reg_state *regs;
12862
12863	branch = push_stack(env, next_idx, curr_idx, true);
12864	if (branch && insn) {
12865		regs = branch->frame[branch->curframe]->regs;
12866		if (BPF_SRC(insn->code) == BPF_K) {
12867			mark_reg_unknown(env, regs, insn->dst_reg);
12868		} else if (BPF_SRC(insn->code) == BPF_X) {
12869			mark_reg_unknown(env, regs, insn->dst_reg);
12870			mark_reg_unknown(env, regs, insn->src_reg);
12871		}
12872	}
12873	return branch;
12874}
12875
12876static int sanitize_ptr_alu(struct bpf_verifier_env *env,
12877			    struct bpf_insn *insn,
12878			    const struct bpf_reg_state *ptr_reg,
12879			    const struct bpf_reg_state *off_reg,
12880			    struct bpf_reg_state *dst_reg,
12881			    struct bpf_sanitize_info *info,
12882			    const bool commit_window)
12883{
12884	struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
12885	struct bpf_verifier_state *vstate = env->cur_state;
12886	bool off_is_imm = tnum_is_const(off_reg->var_off);
12887	bool off_is_neg = off_reg->smin_value < 0;
12888	bool ptr_is_dst_reg = ptr_reg == dst_reg;
12889	u8 opcode = BPF_OP(insn->code);
12890	u32 alu_state, alu_limit;
12891	struct bpf_reg_state tmp;
12892	bool ret;
12893	int err;
12894
12895	if (can_skip_alu_sanitation(env, insn))
12896		return 0;
12897
12898	/* We already marked aux for masking from non-speculative
12899	 * paths, thus we got here in the first place. We only care
12900	 * to explore bad access from here.
12901	 */
12902	if (vstate->speculative)
12903		goto do_sim;
12904
12905	if (!commit_window) {
12906		if (!tnum_is_const(off_reg->var_off) &&
12907		    (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
12908			return REASON_BOUNDS;
12909
12910		info->mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
12911				     (opcode == BPF_SUB && !off_is_neg);
12912	}
12913
12914	err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
12915	if (err < 0)
12916		return err;
12917
12918	if (commit_window) {
12919		/* In commit phase we narrow the masking window based on
12920		 * the observed pointer move after the simulated operation.
12921		 */
12922		alu_state = info->aux.alu_state;
12923		alu_limit = abs(info->aux.alu_limit - alu_limit);
12924	} else {
12925		alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
12926		alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
12927		alu_state |= ptr_is_dst_reg ?
12928			     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
12929
12930		/* Limit pruning on unknown scalars to enable deep search for
12931		 * potential masking differences from other program paths.
12932		 */
12933		if (!off_is_imm)
12934			env->explore_alu_limits = true;
12935	}
12936
12937	err = update_alu_sanitation_state(aux, alu_state, alu_limit);
12938	if (err < 0)
12939		return err;
12940do_sim:
12941	/* If we're in commit phase, we're done here given we already
12942	 * pushed the truncated dst_reg into the speculative verification
12943	 * stack.
12944	 *
12945	 * Also, when register is a known constant, we rewrite register-based
12946	 * operation to immediate-based, and thus do not need masking (and as
12947	 * a consequence, do not need to simulate the zero-truncation either).
12948	 */
12949	if (commit_window || off_is_imm)
12950		return 0;
12951
12952	/* Simulate and find potential out-of-bounds access under
12953	 * speculative execution from truncation as a result of
12954	 * masking when off was not within expected range. If off
12955	 * sits in dst, then we temporarily need to move ptr there
12956	 * to simulate dst (== 0) +/-= ptr. Needed, for example,
12957	 * for cases where we use K-based arithmetic in one direction
12958	 * and truncated reg-based in the other in order to explore
12959	 * bad access.
12960	 */
12961	if (!ptr_is_dst_reg) {
12962		tmp = *dst_reg;
12963		copy_register_state(dst_reg, ptr_reg);
12964	}
12965	ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
12966					env->insn_idx);
12967	if (!ptr_is_dst_reg && ret)
12968		*dst_reg = tmp;
12969	return !ret ? REASON_STACK : 0;
12970}
12971
12972static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
12973{
12974	struct bpf_verifier_state *vstate = env->cur_state;
12975
12976	/* If we simulate paths under speculation, we don't update the
12977	 * insn as 'seen' such that when we verify unreachable paths in
12978	 * the non-speculative domain, sanitize_dead_code() can still
12979	 * rewrite/sanitize them.
12980	 */
12981	if (!vstate->speculative)
12982		env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
12983}
12984
12985static int sanitize_err(struct bpf_verifier_env *env,
12986			const struct bpf_insn *insn, int reason,
12987			const struct bpf_reg_state *off_reg,
12988			const struct bpf_reg_state *dst_reg)
12989{
12990	static const char *err = "pointer arithmetic with it prohibited for !root";
12991	const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
12992	u32 dst = insn->dst_reg, src = insn->src_reg;
12993
12994	switch (reason) {
12995	case REASON_BOUNDS:
12996		verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
12997			off_reg == dst_reg ? dst : src, err);
12998		break;
12999	case REASON_TYPE:
13000		verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
13001			off_reg == dst_reg ? src : dst, err);
13002		break;
13003	case REASON_PATHS:
13004		verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
13005			dst, op, err);
13006		break;
13007	case REASON_LIMIT:
13008		verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
13009			dst, op, err);
13010		break;
13011	case REASON_STACK:
13012		verbose(env, "R%d could not be pushed for speculative verification, %s\n",
13013			dst, err);
13014		break;
13015	default:
13016		verbose(env, "verifier internal error: unknown reason (%d)\n",
13017			reason);
13018		break;
13019	}
13020
13021	return -EACCES;
13022}
13023
13024/* check that stack access falls within stack limits and that 'reg' doesn't
13025 * have a variable offset.
13026 *
13027 * Variable offset is prohibited for unprivileged mode for simplicity since it
13028 * requires corresponding support in Spectre masking for stack ALU.  See also
13029 * retrieve_ptr_limit().
13030 *
13032 * 'off' includes 'reg->off'.
13033 */
13034static int check_stack_access_for_ptr_arithmetic(
13035				struct bpf_verifier_env *env,
13036				int regno,
13037				const struct bpf_reg_state *reg,
13038				int off)
13039{
13040	if (!tnum_is_const(reg->var_off)) {
13041		char tn_buf[48];
13042
13043		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
13044		verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
13045			regno, tn_buf, off);
13046		return -EACCES;
13047	}
13048
13049	if (off >= 0 || off < -MAX_BPF_STACK) {
13050		verbose(env, "R%d stack pointer arithmetic goes out of range, "
13051			"prohibited for !root; off=%d\n", regno, off);
13052		return -EACCES;
13053	}
13054
13055	return 0;
13056}
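
/* For example, assuming MAX_BPF_STACK is 512 as in current kernels, the check
 * above accepts a resulting offset of -8 (one u64 below the frame pointer),
 * but rejects 0 or any positive offset (at or above the frame pointer) as
 * well as anything below -512. A pointer whose var_off is not a constant
 * (e.g. r10 plus a bounded but unknown scalar) is rejected for unprivileged
 * programs regardless of its range.
 */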
13057
13058static int sanitize_check_bounds(struct bpf_verifier_env *env,
13059				 const struct bpf_insn *insn,
13060				 const struct bpf_reg_state *dst_reg)
13061{
13062	u32 dst = insn->dst_reg;
13063
13064	/* For unprivileged programs we require that the resulting offset be in
13065	 * bounds in order to be able to sanitize the access later on.
13066	 */
13067	if (env->bypass_spec_v1)
13068		return 0;
13069
13070	switch (dst_reg->type) {
13071	case PTR_TO_STACK:
13072		if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
13073					dst_reg->off + dst_reg->var_off.value))
13074			return -EACCES;
13075		break;
13076	case PTR_TO_MAP_VALUE:
13077		if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
13078			verbose(env, "R%d pointer arithmetic of map value goes out of range, "
13079				"prohibited for !root\n", dst);
13080			return -EACCES;
13081		}
13082		break;
13083	default:
13084		break;
13085	}
13086
13087	return 0;
13088}
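
/* Illustrative example: for an unprivileged program holding r1 =
 * PTR_TO_MAP_VALUE of an 8 byte value,
 *
 *   r1 += 100
 *
 * is rejected here even if r1 is never dereferenced afterwards, since an
 * out-of-bounds pointer could still be abused as the base of a speculative
 * load that cannot be sanitized.
 */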
13089
13090/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
13091 * Caller should also handle BPF_MOV case separately.
13092 * If we return -EACCES, caller may want to try again treating pointer as a
13093 * scalar.  So we only emit a diagnostic if !env->allow_ptr_leaks.
13094 */
13095static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
13096				   struct bpf_insn *insn,
13097				   const struct bpf_reg_state *ptr_reg,
13098				   const struct bpf_reg_state *off_reg)
13099{
13100	struct bpf_verifier_state *vstate = env->cur_state;
13101	struct bpf_func_state *state = vstate->frame[vstate->curframe];
13102	struct bpf_reg_state *regs = state->regs, *dst_reg;
13103	bool known = tnum_is_const(off_reg->var_off);
13104	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
13105	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
13106	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
13107	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
13108	struct bpf_sanitize_info info = {};
13109	u8 opcode = BPF_OP(insn->code);
13110	u32 dst = insn->dst_reg;
13111	int ret;
13112
13113	dst_reg = &regs[dst];
13114
13115	if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
13116	    smin_val > smax_val || umin_val > umax_val) {
13117		/* Taint dst register if offset had invalid bounds derived from
13118		 * e.g. dead branches.
13119		 */
13120		__mark_reg_unknown(env, dst_reg);
13121		return 0;
13122	}
13123
13124	if (BPF_CLASS(insn->code) != BPF_ALU64) {
13125		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
13126		if (opcode == BPF_SUB && env->allow_ptr_leaks) {
13127			__mark_reg_unknown(env, dst_reg);
13128			return 0;
13129		}
13130
13131		verbose(env,
13132			"R%d 32-bit pointer arithmetic prohibited\n",
13133			dst);
13134		return -EACCES;
13135	}
13136
13137	if (ptr_reg->type & PTR_MAYBE_NULL) {
13138		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
13139			dst, reg_type_str(env, ptr_reg->type));
13140		return -EACCES;
13141	}
13142
13143	switch (base_type(ptr_reg->type)) {
13144	case PTR_TO_CTX:
13145	case PTR_TO_MAP_VALUE:
13146	case PTR_TO_MAP_KEY:
13147	case PTR_TO_STACK:
13148	case PTR_TO_PACKET_META:
13149	case PTR_TO_PACKET:
13150	case PTR_TO_TP_BUFFER:
13151	case PTR_TO_BTF_ID:
13152	case PTR_TO_MEM:
13153	case PTR_TO_BUF:
13154	case PTR_TO_FUNC:
13155	case CONST_PTR_TO_DYNPTR:
13156		break;
13157	case PTR_TO_FLOW_KEYS:
13158		if (known)
13159			break;
13160		fallthrough;
13161	case CONST_PTR_TO_MAP:
13162		/* smin_val represents the known value */
13163		if (known && smin_val == 0 && opcode == BPF_ADD)
13164			break;
13165		fallthrough;
13166	default:
13167		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
13168			dst, reg_type_str(env, ptr_reg->type));
13169		return -EACCES;
13170	}
13171
13172	/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
13173	 * The id may be overwritten later if we create a new variable offset.
13174	 */
13175	dst_reg->type = ptr_reg->type;
13176	dst_reg->id = ptr_reg->id;
13177
13178	if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
13179	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
13180		return -EINVAL;
13181
13182	/* pointer types do not carry 32-bit bounds at the moment. */
13183	__mark_reg32_unbounded(dst_reg);
13184
13185	if (sanitize_needed(opcode)) {
13186		ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
13187				       &info, false);
13188		if (ret < 0)
13189			return sanitize_err(env, insn, ret, off_reg, dst_reg);
13190	}
13191
13192	switch (opcode) {
13193	case BPF_ADD:
13194		/* We can take a fixed offset as long as it doesn't overflow
13195		 * the s32 'off' field
13196		 */
13197		if (known && (ptr_reg->off + smin_val ==
13198			      (s64)(s32)(ptr_reg->off + smin_val))) {
13199			/* pointer += K.  Accumulate it into fixed offset */
13200			dst_reg->smin_value = smin_ptr;
13201			dst_reg->smax_value = smax_ptr;
13202			dst_reg->umin_value = umin_ptr;
13203			dst_reg->umax_value = umax_ptr;
13204			dst_reg->var_off = ptr_reg->var_off;
13205			dst_reg->off = ptr_reg->off + smin_val;
13206			dst_reg->raw = ptr_reg->raw;
13207			break;
13208		}
13209		/* A new variable offset is created.  Note that off_reg->off
13210		 * == 0, since it's a scalar.
13211		 * dst_reg gets the pointer type and since some positive
13212		 * integer value was added to the pointer, give it a new 'id'
13213		 * if it's a PTR_TO_PACKET.
13214		 * This creates a new 'base' pointer; off_reg (variable) gets
13215		 * added into the variable offset, and we copy the fixed offset
13216		 * from ptr_reg.
13217		 */
13218		if (signed_add_overflows(smin_ptr, smin_val) ||
13219		    signed_add_overflows(smax_ptr, smax_val)) {
13220			dst_reg->smin_value = S64_MIN;
13221			dst_reg->smax_value = S64_MAX;
13222		} else {
13223			dst_reg->smin_value = smin_ptr + smin_val;
13224			dst_reg->smax_value = smax_ptr + smax_val;
13225		}
13226		if (umin_ptr + umin_val < umin_ptr ||
13227		    umax_ptr + umax_val < umax_ptr) {
13228			dst_reg->umin_value = 0;
13229			dst_reg->umax_value = U64_MAX;
13230		} else {
13231			dst_reg->umin_value = umin_ptr + umin_val;
13232			dst_reg->umax_value = umax_ptr + umax_val;
13233		}
13234		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
13235		dst_reg->off = ptr_reg->off;
13236		dst_reg->raw = ptr_reg->raw;
13237		if (reg_is_pkt_pointer(ptr_reg)) {
13238			dst_reg->id = ++env->id_gen;
13239			/* something was added to pkt_ptr, set range to zero */
13240			memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
13241		}
13242		break;
13243	case BPF_SUB:
13244		if (dst_reg == off_reg) {
13245			/* scalar -= pointer.  Creates an unknown scalar */
13246			verbose(env, "R%d tried to subtract pointer from scalar\n",
13247				dst);
13248			return -EACCES;
13249		}
13250		/* We don't allow subtraction from FP, because (according to
13251		 * test_verifier.c test "invalid fp arithmetic", JITs might not
13252		 * be able to deal with it.
13253		 */
13254		if (ptr_reg->type == PTR_TO_STACK) {
13255			verbose(env, "R%d subtraction from stack pointer prohibited\n",
13256				dst);
13257			return -EACCES;
13258		}
13259		if (known && (ptr_reg->off - smin_val ==
13260			      (s64)(s32)(ptr_reg->off - smin_val))) {
13261			/* pointer -= K.  Subtract it from fixed offset */
13262			dst_reg->smin_value = smin_ptr;
13263			dst_reg->smax_value = smax_ptr;
13264			dst_reg->umin_value = umin_ptr;
13265			dst_reg->umax_value = umax_ptr;
13266			dst_reg->var_off = ptr_reg->var_off;
13267			dst_reg->id = ptr_reg->id;
13268			dst_reg->off = ptr_reg->off - smin_val;
13269			dst_reg->raw = ptr_reg->raw;
13270			break;
13271		}
13272		/* A new variable offset is created.  If the subtrahend is known
13273		 * nonnegative, then any reg->range we had before is still good.
13274		 */
13275		if (signed_sub_overflows(smin_ptr, smax_val) ||
13276		    signed_sub_overflows(smax_ptr, smin_val)) {
13277			/* Overflow possible, we know nothing */
13278			dst_reg->smin_value = S64_MIN;
13279			dst_reg->smax_value = S64_MAX;
13280		} else {
13281			dst_reg->smin_value = smin_ptr - smax_val;
13282			dst_reg->smax_value = smax_ptr - smin_val;
13283		}
13284		if (umin_ptr < umax_val) {
13285			/* Overflow possible, we know nothing */
13286			dst_reg->umin_value = 0;
13287			dst_reg->umax_value = U64_MAX;
13288		} else {
13289			/* Cannot overflow (as long as bounds are consistent) */
13290			dst_reg->umin_value = umin_ptr - umax_val;
13291			dst_reg->umax_value = umax_ptr - umin_val;
13292		}
13293		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
13294		dst_reg->off = ptr_reg->off;
13295		dst_reg->raw = ptr_reg->raw;
13296		if (reg_is_pkt_pointer(ptr_reg)) {
13297			dst_reg->id = ++env->id_gen;
13298			/* pkt_ptr may have been advanced (negative value subtracted), set range to zero */
13299			if (smin_val < 0)
13300				memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
13301		}
13302		break;
13303	case BPF_AND:
13304	case BPF_OR:
13305	case BPF_XOR:
13306		/* bitwise ops on pointers are troublesome, prohibit. */
13307		verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
13308			dst, bpf_alu_string[opcode >> 4]);
13309		return -EACCES;
13310	default:
13311		/* other operators (e.g. MUL,LSH) produce non-pointer results */
13312		verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
13313			dst, bpf_alu_string[opcode >> 4]);
13314		return -EACCES;
13315	}
13316
13317	if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
13318		return -EINVAL;
13319	reg_bounds_sync(dst_reg);
13320	if (sanitize_check_bounds(env, insn, dst_reg) < 0)
13321		return -EACCES;
13322	if (sanitize_needed(opcode)) {
13323		ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
13324				       &info, true);
13325		if (ret < 0)
13326			return sanitize_err(env, insn, ret, off_reg, dst_reg);
13327	}
13328
13329	return 0;
13330}
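
/* Illustrative example: with r1 = PTR_TO_STACK (fixed off -16) and r2 a
 * scalar known to be in [0, 4]:
 *
 *   r1 += 8     ; pointer += K: folded into the fixed offset, off becomes -8
 *   r1 += r2    ; pointer += unknown scalar: r2's bounds are added to r1's
 *               ; smin/smax/umin/umax, its var_off is tnum_add()ed, and the
 *               ; fixed offset stays as is
 *
 * In both cases the result must still pass sanitize_check_bounds() above
 * before an unprivileged program may use it.
 */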
13331
13332static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
13333				 struct bpf_reg_state *src_reg)
13334{
13335	s32 smin_val = src_reg->s32_min_value;
13336	s32 smax_val = src_reg->s32_max_value;
13337	u32 umin_val = src_reg->u32_min_value;
13338	u32 umax_val = src_reg->u32_max_value;
13339
13340	if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
13341	    signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
13342		dst_reg->s32_min_value = S32_MIN;
13343		dst_reg->s32_max_value = S32_MAX;
13344	} else {
13345		dst_reg->s32_min_value += smin_val;
13346		dst_reg->s32_max_value += smax_val;
13347	}
13348	if (dst_reg->u32_min_value + umin_val < umin_val ||
13349	    dst_reg->u32_max_value + umax_val < umax_val) {
13350		dst_reg->u32_min_value = 0;
13351		dst_reg->u32_max_value = U32_MAX;
13352	} else {
13353		dst_reg->u32_min_value += umin_val;
13354		dst_reg->u32_max_value += umax_val;
13355	}
13356}
13357
13358static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
13359			       struct bpf_reg_state *src_reg)
13360{
13361	s64 smin_val = src_reg->smin_value;
13362	s64 smax_val = src_reg->smax_value;
13363	u64 umin_val = src_reg->umin_value;
13364	u64 umax_val = src_reg->umax_value;
13365
13366	if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
13367	    signed_add_overflows(dst_reg->smax_value, smax_val)) {
13368		dst_reg->smin_value = S64_MIN;
13369		dst_reg->smax_value = S64_MAX;
13370	} else {
13371		dst_reg->smin_value += smin_val;
13372		dst_reg->smax_value += smax_val;
13373	}
13374	if (dst_reg->umin_value + umin_val < umin_val ||
13375	    dst_reg->umax_value + umax_val < umax_val) {
13376		dst_reg->umin_value = 0;
13377		dst_reg->umax_value = U64_MAX;
13378	} else {
13379		dst_reg->umin_value += umin_val;
13380		dst_reg->umax_value += umax_val;
13381	}
13382}
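
/* For example, dst in [10, 20] plus src in [1, 5] yields [11, 25] for both
 * the signed and the unsigned bounds. If the signed addition might overflow
 * (e.g. smax_value == S64_MAX - 1 and smax_val == 5), the signed bounds are
 * reset to [S64_MIN, S64_MAX]; an unsigned wrap-around likewise resets the
 * unsigned bounds to [0, U64_MAX].
 */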
13383
13384static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
13385				 struct bpf_reg_state *src_reg)
13386{
13387	s32 smin_val = src_reg->s32_min_value;
13388	s32 smax_val = src_reg->s32_max_value;
13389	u32 umin_val = src_reg->u32_min_value;
13390	u32 umax_val = src_reg->u32_max_value;
13391
13392	if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
13393	    signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
13394		/* Overflow possible, we know nothing */
13395		dst_reg->s32_min_value = S32_MIN;
13396		dst_reg->s32_max_value = S32_MAX;
13397	} else {
13398		dst_reg->s32_min_value -= smax_val;
13399		dst_reg->s32_max_value -= smin_val;
13400	}
13401	if (dst_reg->u32_min_value < umax_val) {
13402		/* Overflow possible, we know nothing */
13403		dst_reg->u32_min_value = 0;
13404		dst_reg->u32_max_value = U32_MAX;
13405	} else {
13406		/* Cannot overflow (as long as bounds are consistent) */
13407		dst_reg->u32_min_value -= umax_val;
13408		dst_reg->u32_max_value -= umin_val;
13409	}
13410}
13411
13412static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
13413			       struct bpf_reg_state *src_reg)
13414{
13415	s64 smin_val = src_reg->smin_value;
13416	s64 smax_val = src_reg->smax_value;
13417	u64 umin_val = src_reg->umin_value;
13418	u64 umax_val = src_reg->umax_value;
13419
13420	if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
13421	    signed_sub_overflows(dst_reg->smax_value, smin_val)) {
13422		/* Overflow possible, we know nothing */
13423		dst_reg->smin_value = S64_MIN;
13424		dst_reg->smax_value = S64_MAX;
13425	} else {
13426		dst_reg->smin_value -= smax_val;
13427		dst_reg->smax_value -= smin_val;
13428	}
13429	if (dst_reg->umin_value < umax_val) {
13430		/* Overflow possible, we know nothing */
13431		dst_reg->umin_value = 0;
13432		dst_reg->umax_value = U64_MAX;
13433	} else {
13434		/* Cannot overflow (as long as bounds are consistent) */
13435		dst_reg->umin_value -= umax_val;
13436		dst_reg->umax_value -= umin_val;
13437	}
13438}
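
/* For example, dst in [10, 20] minus src in [1, 5] yields signed bounds
 * [10 - 5, 20 - 1] = [5, 19]. The unsigned bounds become [5, 19] as well,
 * since dst's umin (10) is not smaller than src's umax (5); if it were, the
 * subtraction could wrap and the unsigned bounds would be reset to
 * [0, U64_MAX].
 */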
13439
13440static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
13441				 struct bpf_reg_state *src_reg)
13442{
13443	s32 smin_val = src_reg->s32_min_value;
13444	u32 umin_val = src_reg->u32_min_value;
13445	u32 umax_val = src_reg->u32_max_value;
13446
13447	if (smin_val < 0 || dst_reg->s32_min_value < 0) {
13448		/* Ain't nobody got time to multiply that sign */
13449		__mark_reg32_unbounded(dst_reg);
13450		return;
13451	}
13452	/* Both values are positive, so we can work with unsigned and
13453	 * copy the result to signed (unless it exceeds S32_MAX).
13454	 */
13455	if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
13456		/* Potential overflow, we know nothing */
13457		__mark_reg32_unbounded(dst_reg);
13458		return;
13459	}
13460	dst_reg->u32_min_value *= umin_val;
13461	dst_reg->u32_max_value *= umax_val;
13462	if (dst_reg->u32_max_value > S32_MAX) {
13463		/* Overflow possible, we know nothing */
13464		dst_reg->s32_min_value = S32_MIN;
13465		dst_reg->s32_max_value = S32_MAX;
13466	} else {
13467		dst_reg->s32_min_value = dst_reg->u32_min_value;
13468		dst_reg->s32_max_value = dst_reg->u32_max_value;
13469	}
13470}
13471
13472static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
13473			       struct bpf_reg_state *src_reg)
13474{
13475	s64 smin_val = src_reg->smin_value;
13476	u64 umin_val = src_reg->umin_value;
13477	u64 umax_val = src_reg->umax_value;
13478
13479	if (smin_val < 0 || dst_reg->smin_value < 0) {
13480		/* Ain't nobody got time to multiply that sign */
13481		__mark_reg64_unbounded(dst_reg);
13482		return;
13483	}
13484	/* Both values are positive, so we can work with unsigned and
13485	 * copy the result to signed (unless it exceeds S64_MAX).
13486	 */
13487	if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
13488		/* Potential overflow, we know nothing */
13489		__mark_reg64_unbounded(dst_reg);
13490		return;
13491	}
13492	dst_reg->umin_value *= umin_val;
13493	dst_reg->umax_value *= umax_val;
13494	if (dst_reg->umax_value > S64_MAX) {
13495		/* Overflow possible, we know nothing */
13496		dst_reg->smin_value = S64_MIN;
13497		dst_reg->smax_value = S64_MAX;
13498	} else {
13499		dst_reg->smin_value = dst_reg->umin_value;
13500		dst_reg->smax_value = dst_reg->umax_value;
13501	}
13502}
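
/* For example, dst in [2, 10] times src in [3, 4] (both non-negative and no
 * larger than U32_MAX) yields unsigned bounds [6, 40], which also fit into
 * s64 and therefore become the signed bounds too. If either operand might be
 * negative, or either umax exceeds U32_MAX so the product could overflow
 * 64 bits, the register is simply marked unbounded.
 */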
13503
13504static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
13505				 struct bpf_reg_state *src_reg)
13506{
13507	bool src_known = tnum_subreg_is_const(src_reg->var_off);
13508	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
13509	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
13510	u32 umax_val = src_reg->u32_max_value;
13511
13512	if (src_known && dst_known) {
13513		__mark_reg32_known(dst_reg, var32_off.value);
13514		return;
13515	}
13516
13517	/* We get our minimum from the var_off, since that's inherently
13518	 * bitwise.  Our maximum is the minimum of the operands' maxima.
13519	 */
13520	dst_reg->u32_min_value = var32_off.value;
13521	dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
13522
13523	/* Safe to set s32 bounds by casting u32 result into s32 when u32
13524	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
13525	 */
13526	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
13527		dst_reg->s32_min_value = dst_reg->u32_min_value;
13528		dst_reg->s32_max_value = dst_reg->u32_max_value;
13529	} else {
13530		dst_reg->s32_min_value = S32_MIN;
13531		dst_reg->s32_max_value = S32_MAX;
13532	}
13533}
13534
13535static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
13536			       struct bpf_reg_state *src_reg)
13537{
13538	bool src_known = tnum_is_const(src_reg->var_off);
13539	bool dst_known = tnum_is_const(dst_reg->var_off);
13540	u64 umax_val = src_reg->umax_value;
13541
13542	if (src_known && dst_known) {
13543		__mark_reg_known(dst_reg, dst_reg->var_off.value);
13544		return;
13545	}
13546
13547	/* We get our minimum from the var_off, since that's inherently
13548	 * bitwise.  Our maximum is the minimum of the operands' maxima.
13549	 */
13550	dst_reg->umin_value = dst_reg->var_off.value;
13551	dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
13552
13553	/* Safe to set s64 bounds by casting u64 result into s64 when u64
13554	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
13555	 */
13556	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
13557		dst_reg->smin_value = dst_reg->umin_value;
13558		dst_reg->smax_value = dst_reg->umax_value;
13559	} else {
13560		dst_reg->smin_value = S64_MIN;
13561		dst_reg->smax_value = S64_MAX;
13562	}
13563	/* We may learn something more from the var_off */
13564	__update_reg_bounds(dst_reg);
13565}
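
/* For example, masking a completely unknown scalar with a constant:
 *
 *   r1 &= 0xff
 *
 * leaves a var_off with all bits above bit 7 known to be zero, so umin_value
 * becomes var_off.value (0 here) and umax_value becomes min(old umax, 0xff).
 * Since [0, 0xff] does not cross the s64 sign boundary, the signed bounds are
 * set to the same range.
 */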
13566
13567static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
13568				struct bpf_reg_state *src_reg)
13569{
13570	bool src_known = tnum_subreg_is_const(src_reg->var_off);
13571	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
13572	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
13573	u32 umin_val = src_reg->u32_min_value;
13574
13575	if (src_known && dst_known) {
13576		__mark_reg32_known(dst_reg, var32_off.value);
13577		return;
13578	}
13579
13580	/* We get our maximum from the var_off, and our minimum is the
13581	 * maximum of the operands' minima
13582	 */
13583	dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
13584	dst_reg->u32_max_value = var32_off.value | var32_off.mask;
13585
13586	/* Safe to set s32 bounds by casting u32 result into s32 when u32
13587	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
13588	 */
13589	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
13590		dst_reg->s32_min_value = dst_reg->u32_min_value;
13591		dst_reg->s32_max_value = dst_reg->u32_max_value;
13592	} else {
13593		dst_reg->s32_min_value = S32_MIN;
13594		dst_reg->s32_max_value = S32_MAX;
13595	}
13596}
13597
13598static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
13599			      struct bpf_reg_state *src_reg)
13600{
13601	bool src_known = tnum_is_const(src_reg->var_off);
13602	bool dst_known = tnum_is_const(dst_reg->var_off);
13603	u64 umin_val = src_reg->umin_value;
13604
13605	if (src_known && dst_known) {
13606		__mark_reg_known(dst_reg, dst_reg->var_off.value);
13607		return;
13608	}
13609
13610	/* We get our maximum from the var_off, and our minimum is the
13611	 * maximum of the operands' minima
13612	 */
13613	dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
13614	dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
13615
13616	/* Safe to set s64 bounds by casting u64 result into s64 when u64
13617	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
13618	 */
13619	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
13620		dst_reg->smin_value = dst_reg->umin_value;
13621		dst_reg->smax_value = dst_reg->umax_value;
13622	} else {
13623		dst_reg->smin_value = S64_MIN;
13624		dst_reg->smax_value = S64_MAX;
13625	}
13626	/* We may learn something more from the var_off */
13627	__update_reg_bounds(dst_reg);
13628}
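
/* For example, OR-ing in a constant:
 *
 *   r1 |= 0x80
 *
 * guarantees that bit 7 is set in the result, so umin_value is raised to at
 * least 0x80 (the maximum of the operands' minima) while umax_value is
 * whatever var_off still allows (value | mask). As with AND, the signed
 * bounds only follow when the resulting unsigned range does not cross the
 * sign boundary.
 */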
13629
13630static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
13631				 struct bpf_reg_state *src_reg)
13632{
13633	bool src_known = tnum_subreg_is_const(src_reg->var_off);
13634	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
13635	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
13636
13637	if (src_known && dst_known) {
13638		__mark_reg32_known(dst_reg, var32_off.value);
13639		return;
13640	}
13641
13642	/* We get both minimum and maximum from the var32_off. */
13643	dst_reg->u32_min_value = var32_off.value;
13644	dst_reg->u32_max_value = var32_off.value | var32_off.mask;
13645
13646	/* Safe to set s32 bounds by casting u32 result into s32 when u32
13647	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
13648	 */
13649	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
13650		dst_reg->s32_min_value = dst_reg->u32_min_value;
13651		dst_reg->s32_max_value = dst_reg->u32_max_value;
13652	} else {
13653		dst_reg->s32_min_value = S32_MIN;
13654		dst_reg->s32_max_value = S32_MAX;
13655	}
13656}
13657
13658static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
13659			       struct bpf_reg_state *src_reg)
13660{
13661	bool src_known = tnum_is_const(src_reg->var_off);
13662	bool dst_known = tnum_is_const(dst_reg->var_off);
13663
13664	if (src_known && dst_known) {
13665		/* dst_reg->var_off.value has been updated earlier */
13666		__mark_reg_known(dst_reg, dst_reg->var_off.value);
13667		return;
13668	}
13669
13670	/* We get both minimum and maximum from the var_off. */
13671	dst_reg->umin_value = dst_reg->var_off.value;
13672	dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
13673
13674	/* Safe to set s64 bounds by casting u64 result into s64 when u64
13675	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
13676	 */
13677	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
13678		dst_reg->smin_value = dst_reg->umin_value;
13679		dst_reg->smax_value = dst_reg->umax_value;
13680	} else {
13681		dst_reg->smin_value = S64_MIN;
13682		dst_reg->smax_value = S64_MAX;
13683	}
13684
13685	__update_reg_bounds(dst_reg);
13686}
13687
13688static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
13689				   u64 umin_val, u64 umax_val)
13690{
13691	/* We lose all sign bit information (except what we can pick
13692	 * up from var_off)
13693	 */
13694	dst_reg->s32_min_value = S32_MIN;
13695	dst_reg->s32_max_value = S32_MAX;
13696	/* If we might shift our top bit out, then we know nothing */
13697	if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
13698		dst_reg->u32_min_value = 0;
13699		dst_reg->u32_max_value = U32_MAX;
13700	} else {
13701		dst_reg->u32_min_value <<= umin_val;
13702		dst_reg->u32_max_value <<= umax_val;
13703	}
13704}
13705
13706static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
13707				 struct bpf_reg_state *src_reg)
13708{
13709	u32 umax_val = src_reg->u32_max_value;
13710	u32 umin_val = src_reg->u32_min_value;
13711	/* u32 alu operation will zext upper bits */
13712	struct tnum subreg = tnum_subreg(dst_reg->var_off);
13713
13714	__scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
13715	dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
13716	/* Not strictly required, but to be careful mark the reg64 bounds as
13717	 * unknown so that we are forced to pick them up from the tnum and
13718	 * zext later; if some path skips this step we are still safe.
13719	 */
13720	__mark_reg64_unbounded(dst_reg);
13721	__update_reg32_bounds(dst_reg);
13722}
13723
13724static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
13725				   u64 umin_val, u64 umax_val)
13726{
13727	/* Special case <<32 because it is a common compiler pattern to sign
13728	 * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
13729	 * positive we know this shift will also be positive so we can track
13730	 * bounds correctly. Otherwise we lose all sign bit information except
13731	 * what we can pick up from var_off. Perhaps we can generalize this
13732	 * later to shifts of any length.
13733	 */
13734	if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
13735		dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
13736	else
13737		dst_reg->smax_value = S64_MAX;
13738
13739	if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
13740		dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
13741	else
13742		dst_reg->smin_value = S64_MIN;
13743
13744	/* If we might shift our top bit out, then we know nothing */
13745	if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
13746		dst_reg->umin_value = 0;
13747		dst_reg->umax_value = U64_MAX;
13748	} else {
13749		dst_reg->umin_value <<= umin_val;
13750		dst_reg->umax_value <<= umax_val;
13751	}
13752}
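
/* The special case above matches the sign-extension idiom that compilers
 * commonly emit for 32-bit values, e.g.:
 *
 *   r1 <<= 32
 *   r1 s>>= 32
 *
 * If the 32-bit bounds before the shift are non-negative, say [0, 100], the
 * 64-bit signed bounds after the <<32 become [0 << 32, 100 << 32] rather
 * than [S64_MIN, S64_MAX], which lets the following arithmetic right shift
 * recover tight bounds for the sign-extended value.
 */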
13753
13754static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
13755			       struct bpf_reg_state *src_reg)
13756{
13757	u64 umax_val = src_reg->umax_value;
13758	u64 umin_val = src_reg->umin_value;
13759
13760	/* scalar64 calc uses 32bit unshifted bounds so must be called first */
13761	__scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
13762	__scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
13763
13764	dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
13765	/* We may learn something more from the var_off */
13766	__update_reg_bounds(dst_reg);
13767}
13768
13769static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
13770				 struct bpf_reg_state *src_reg)
13771{
13772	struct tnum subreg = tnum_subreg(dst_reg->var_off);
13773	u32 umax_val = src_reg->u32_max_value;
13774	u32 umin_val = src_reg->u32_min_value;
13775
13776	/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
13777	 * be negative, then either:
13778	 * 1) src_reg might be zero, so the sign bit of the result is
13779	 *    unknown, so we lose our signed bounds
13780	 * 2) it's known negative, thus the unsigned bounds capture the
13781	 *    signed bounds
13782	 * 3) the signed bounds cross zero, so they tell us nothing
13783	 *    about the result
13784	 * If the value in dst_reg is known nonnegative, then again the
13785	 * unsigned bounds capture the signed bounds.
13786	 * Thus, in all cases it suffices to blow away our signed bounds
13787	 * and rely on inferring new ones from the unsigned bounds and
13788	 * var_off of the result.
13789	 */
13790	dst_reg->s32_min_value = S32_MIN;
13791	dst_reg->s32_max_value = S32_MAX;
13792
13793	dst_reg->var_off = tnum_rshift(subreg, umin_val);
13794	dst_reg->u32_min_value >>= umax_val;
13795	dst_reg->u32_max_value >>= umin_val;
13796
13797	__mark_reg64_unbounded(dst_reg);
13798	__update_reg32_bounds(dst_reg);
13799}
13800
13801static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
13802			       struct bpf_reg_state *src_reg)
13803{
13804	u64 umax_val = src_reg->umax_value;
13805	u64 umin_val = src_reg->umin_value;
13806
13807	/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
13808	 * be negative, then either:
13809	 * 1) src_reg might be zero, so the sign bit of the result is
13810	 *    unknown, so we lose our signed bounds
13811	 * 2) it's known negative, thus the unsigned bounds capture the
13812	 *    signed bounds
13813	 * 3) the signed bounds cross zero, so they tell us nothing
13814	 *    about the result
13815	 * If the value in dst_reg is known nonnegative, then again the
13816	 * unsigned bounds capture the signed bounds.
13817	 * Thus, in all cases it suffices to blow away our signed bounds
13818	 * and rely on inferring new ones from the unsigned bounds and
13819	 * var_off of the result.
13820	 */
13821	dst_reg->smin_value = S64_MIN;
13822	dst_reg->smax_value = S64_MAX;
13823	dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
13824	dst_reg->umin_value >>= umax_val;
13825	dst_reg->umax_value >>= umin_val;
13826
13827	/* It's not easy to operate on alu32 bounds here because it depends
13828	 * on bits being shifted in. Take the easy way out and mark unbounded
13829	 * so we can recalculate later from tnum.
13830	 */
13831	__mark_reg32_unbounded(dst_reg);
13832	__update_reg_bounds(dst_reg);
13833}
13834
13835static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
13836				  struct bpf_reg_state *src_reg)
13837{
13838	u64 umin_val = src_reg->u32_min_value;
13839
13840	/* Upon reaching here, src_known is true and
13841	 * umax_val is equal to umin_val.
13842	 */
13843	dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
13844	dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
13845
13846	dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
13847
13848	/* blow away the dst_reg umin_value/umax_value and rely on
13849	 * dst_reg var_off to refine the result.
13850	 */
13851	dst_reg->u32_min_value = 0;
13852	dst_reg->u32_max_value = U32_MAX;
13853
13854	__mark_reg64_unbounded(dst_reg);
13855	__update_reg32_bounds(dst_reg);
13856}
13857
13858static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
13859				struct bpf_reg_state *src_reg)
13860{
13861	u64 umin_val = src_reg->umin_value;
13862
13863	/* Upon reaching here, src_known is true and umax_val is equal
13864	 * to umin_val.
13865	 */
13866	dst_reg->smin_value >>= umin_val;
13867	dst_reg->smax_value >>= umin_val;
13868
13869	dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
13870
13871	/* blow away the dst_reg umin_value/umax_value and rely on
13872	 * dst_reg var_off to refine the result.
13873	 */
13874	dst_reg->umin_value = 0;
13875	dst_reg->umax_value = U64_MAX;
13876
13877	/* It's not easy to operate on alu32 bounds here because it depends
13878	 * on bits being shifted in from the upper 32 bits. Take the easy way out
13879	 * and mark unbounded so we can recalculate later from tnum.
13880	 */
13881	__mark_reg32_unbounded(dst_reg);
13882	__update_reg_bounds(dst_reg);
13883}
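
/* For example, with dst signed bounds [-8, 8] and a constant shift amount:
 *
 *   r1 s>>= 2
 *
 * the signed bounds become [-8 >> 2, 8 >> 2] = [-2, 2], while the unsigned
 * bounds are thrown away and later recomputed from the resulting var_off.
 */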
13884
13885static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
13886					     const struct bpf_reg_state *src_reg)
13887{
13888	bool src_is_const = false;
13889	u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
13890
13891	if (insn_bitness == 32) {
13892		if (tnum_subreg_is_const(src_reg->var_off)
13893		    && src_reg->s32_min_value == src_reg->s32_max_value
13894		    && src_reg->u32_min_value == src_reg->u32_max_value)
13895			src_is_const = true;
13896	} else {
13897		if (tnum_is_const(src_reg->var_off)
13898		    && src_reg->smin_value == src_reg->smax_value
13899		    && src_reg->umin_value == src_reg->umax_value)
13900			src_is_const = true;
13901	}
13902
13903	switch (BPF_OP(insn->code)) {
13904	case BPF_ADD:
13905	case BPF_SUB:
13906	case BPF_AND:
13907	case BPF_XOR:
13908	case BPF_OR:
13909	case BPF_MUL:
13910		return true;
13911
13912	/* The range of a shift operation is only computable if the shift
13913	 * amount operand is a constant. Shifts greater than 31 or 63 are
13914	 * undefined, which includes shifts by a negative number.
13915	 */
13916	case BPF_LSH:
13917	case BPF_RSH:
13918	case BPF_ARSH:
13919		return (src_is_const && src_reg->umax_value < insn_bitness);
13920	default:
13921		return false;
13922	}
13923}
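
/* For example, for a 64-bit shift by a register:
 *
 *   r1 <<= r2
 *
 * the range of r1 is only computed when r2 is a known constant smaller than
 * 64. If r2 is unknown, or could be 64 or more, the caller below gives up
 * and marks r1 as a completely unknown scalar.
 */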
13924
13925/* WARNING: This function does calculations on 64-bit values, but the actual
13926 * execution may occur on 32-bit values. Therefore, things like bitshifts
13927 * need extra checks in the 32-bit case.
13928 */
13929static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
13930				      struct bpf_insn *insn,
13931				      struct bpf_reg_state *dst_reg,
13932				      struct bpf_reg_state src_reg)
13933{
13934	u8 opcode = BPF_OP(insn->code);
13935	bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
13936	int ret;
13937
13938	if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
13939		__mark_reg_unknown(env, dst_reg);
13940		return 0;
13941	}
13942
13943	if (sanitize_needed(opcode)) {
13944		ret = sanitize_val_alu(env, insn);
13945		if (ret < 0)
13946			return sanitize_err(env, insn, ret, NULL, NULL);
13947	}
13948
13949	/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
13950	 * There are two classes of instructions: for the first class we track
13951	 * both alu32 and alu64 sign/unsigned bounds independently; this
13952	 * provides the greatest amount of precision when alu operations are
13953	 * mixed with jmp32 operations. These operations are BPF_ADD, BPF_SUB,
13954	 * BPF_MUL, BPF_AND, BPF_OR and BPF_XOR. This is possible because these
13955	 * ops have fairly easy to understand and calculate behavior in both
13956	 * 32-bit and 64-bit alu ops. See alu32 verifier tests for examples. The
13957	 * second class of operations, BPF_LSH, BPF_RSH, and BPF_ARSH, are not so easy
13958	 * with regards to tracking sign/unsigned bounds because the bits may
13959	 * cross subreg boundaries in the alu64 case. When this happens we mark
13960	 * the reg unbounded in the subreg bound space and use the resulting
13961	 * tnum to calculate an approximation of the sign/unsigned bounds.
13962	 */
13963	switch (opcode) {
13964	case BPF_ADD:
13965		scalar32_min_max_add(dst_reg, &src_reg);
13966		scalar_min_max_add(dst_reg, &src_reg);
13967		dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
13968		break;
13969	case BPF_SUB:
13970		scalar32_min_max_sub(dst_reg, &src_reg);
13971		scalar_min_max_sub(dst_reg, &src_reg);
13972		dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
13973		break;
13974	case BPF_MUL:
13975		dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
13976		scalar32_min_max_mul(dst_reg, &src_reg);
13977		scalar_min_max_mul(dst_reg, &src_reg);
13978		break;
13979	case BPF_AND:
13980		dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
13981		scalar32_min_max_and(dst_reg, &src_reg);
13982		scalar_min_max_and(dst_reg, &src_reg);
13983		break;
13984	case BPF_OR:
13985		dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
13986		scalar32_min_max_or(dst_reg, &src_reg);
13987		scalar_min_max_or(dst_reg, &src_reg);
13988		break;
13989	case BPF_XOR:
13990		dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
13991		scalar32_min_max_xor(dst_reg, &src_reg);
13992		scalar_min_max_xor(dst_reg, &src_reg);
13993		break;
13994	case BPF_LSH:
13995		if (alu32)
13996			scalar32_min_max_lsh(dst_reg, &src_reg);
13997		else
13998			scalar_min_max_lsh(dst_reg, &src_reg);
13999		break;
14000	case BPF_RSH:
14001		if (alu32)
14002			scalar32_min_max_rsh(dst_reg, &src_reg);
14003		else
14004			scalar_min_max_rsh(dst_reg, &src_reg);
14005		break;
14006	case BPF_ARSH:
14007		if (alu32)
14008			scalar32_min_max_arsh(dst_reg, &src_reg);
14009		else
14010			scalar_min_max_arsh(dst_reg, &src_reg);
14011		break;
14012	default:
14013		break;
14014	}
14015
14016	/* ALU32 ops are zero extended into 64bit register */
14017	if (alu32)
14018		zext_32_to_64(dst_reg);
14019	reg_bounds_sync(dst_reg);
14020	return 0;
14021}
14022
14023/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
14024 * and var_off.
14025 */
14026static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
14027				   struct bpf_insn *insn)
14028{
14029	struct bpf_verifier_state *vstate = env->cur_state;
14030	struct bpf_func_state *state = vstate->frame[vstate->curframe];
14031	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
14032	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
14033	u8 opcode = BPF_OP(insn->code);
14034	int err;
14035
14036	dst_reg = &regs[insn->dst_reg];
14037	src_reg = NULL;
14038
14039	if (dst_reg->type == PTR_TO_ARENA) {
14040		struct bpf_insn_aux_data *aux = cur_aux(env);
14041
14042		if (BPF_CLASS(insn->code) == BPF_ALU64)
14043			/*
14044			 * 32-bit operations zero upper bits automatically.
14045			 * 64-bit operations need to be converted to 32.
14046			 */
14047			aux->needs_zext = true;
14048
14049		/* Any arithmetic operations are allowed on arena pointers */
14050		return 0;
14051	}
14052
14053	if (dst_reg->type != SCALAR_VALUE)
14054		ptr_reg = dst_reg;
14055	else
14056		/* Make sure ID is cleared otherwise dst_reg min/max could be
14057		 * incorrectly propagated into other registers by find_equal_scalars()
14058		 */
14059		dst_reg->id = 0;
14060	if (BPF_SRC(insn->code) == BPF_X) {
14061		src_reg = &regs[insn->src_reg];
14062		if (src_reg->type != SCALAR_VALUE) {
14063			if (dst_reg->type != SCALAR_VALUE) {
14064				/* Combining two pointers by any ALU op yields
14065				 * an arbitrary scalar. Disallow all math except
14066				 * pointer subtraction
14067				 */
14068				if (opcode == BPF_SUB && env->allow_ptr_leaks) {
14069					mark_reg_unknown(env, regs, insn->dst_reg);
14070					return 0;
14071				}
14072				verbose(env, "R%d pointer %s pointer prohibited\n",
14073					insn->dst_reg,
14074					bpf_alu_string[opcode >> 4]);
14075				return -EACCES;
14076			} else {
14077				/* scalar += pointer
14078				 * This is legal, but we have to reverse our
14079				 * src/dest handling in computing the range
14080				 */
14081				err = mark_chain_precision(env, insn->dst_reg);
14082				if (err)
14083					return err;
14084				return adjust_ptr_min_max_vals(env, insn,
14085							       src_reg, dst_reg);
14086			}
14087		} else if (ptr_reg) {
14088			/* pointer += scalar */
14089			err = mark_chain_precision(env, insn->src_reg);
14090			if (err)
14091				return err;
14092			return adjust_ptr_min_max_vals(env, insn,
14093						       dst_reg, src_reg);
14094		} else if (dst_reg->precise) {
14095			/* if dst_reg is precise, src_reg should be precise as well */
14096			err = mark_chain_precision(env, insn->src_reg);
14097			if (err)
14098				return err;
14099		}
14100	} else {
14101		/* Pretend the src is a reg with a known value, since we only
14102		 * need to be able to read from this state.
14103		 */
14104		off_reg.type = SCALAR_VALUE;
14105		__mark_reg_known(&off_reg, insn->imm);
14106		src_reg = &off_reg;
14107		if (ptr_reg) /* pointer += K */
14108			return adjust_ptr_min_max_vals(env, insn,
14109						       ptr_reg, src_reg);
14110	}
14111
14112	/* Got here implies adding two SCALAR_VALUEs */
14113	if (WARN_ON_ONCE(ptr_reg)) {
14114		print_verifier_state(env, state, true);
14115		verbose(env, "verifier internal error: unexpected ptr_reg\n");
14116		return -EINVAL;
14117	}
14118	if (WARN_ON(!src_reg)) {
14119		print_verifier_state(env, state, true);
14120		verbose(env, "verifier internal error: no src_reg\n");
14121		return -EINVAL;
14122	}
14123	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
14124}
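
/* Dispatch examples (illustrative only), where r1 and r10 hold pointers and
 * r2, r3 hold scalars:
 *
 *   r1 += r2   ; pointer += scalar  -> adjust_ptr_min_max_vals(dst, src)
 *   r2 += r1   ; scalar += pointer  -> adjust_ptr_min_max_vals(src, dst),
 *              ;                       i.e. the roles are swapped so the
 *              ;                       pointer is always the ptr_reg argument
 *   r2 += r3   ; scalar += scalar   -> adjust_scalar_min_max_vals()
 *   r1 -= r10  ; pointer -= pointer -> unknown scalar, and only permitted
 *              ;                       when env->allow_ptr_leaks
 */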
14125
14126/* check validity of 32-bit and 64-bit arithmetic operations */
14127static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
14128{
14129	struct bpf_reg_state *regs = cur_regs(env);
14130	u8 opcode = BPF_OP(insn->code);
14131	int err;
14132
14133	if (opcode == BPF_END || opcode == BPF_NEG) {
14134		if (opcode == BPF_NEG) {
14135			if (BPF_SRC(insn->code) != BPF_K ||
14136			    insn->src_reg != BPF_REG_0 ||
14137			    insn->off != 0 || insn->imm != 0) {
14138				verbose(env, "BPF_NEG uses reserved fields\n");
14139				return -EINVAL;
14140			}
14141		} else {
14142			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
14143			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
14144			    (BPF_CLASS(insn->code) == BPF_ALU64 &&
14145			     BPF_SRC(insn->code) != BPF_TO_LE)) {
14146				verbose(env, "BPF_END uses reserved fields\n");
14147				return -EINVAL;
14148			}
14149		}
14150
14151		/* check src operand */
14152		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
14153		if (err)
14154			return err;
14155
14156		if (is_pointer_value(env, insn->dst_reg)) {
14157			verbose(env, "R%d pointer arithmetic prohibited\n",
14158				insn->dst_reg);
14159			return -EACCES;
14160		}
14161
14162		/* check dest operand */
14163		err = check_reg_arg(env, insn->dst_reg, DST_OP);
14164		if (err)
14165			return err;
14166
14167	} else if (opcode == BPF_MOV) {
14168
14169		if (BPF_SRC(insn->code) == BPF_X) {
14170			if (BPF_CLASS(insn->code) == BPF_ALU) {
14171				if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
14172				    insn->imm) {
14173					verbose(env, "BPF_MOV uses reserved fields\n");
14174					return -EINVAL;
14175				}
14176			} else if (insn->off == BPF_ADDR_SPACE_CAST) {
14177				if (insn->imm != 1 && insn->imm != 1u << 16) {
14178					verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
14179					return -EINVAL;
14180				}
14181				if (!env->prog->aux->arena) {
14182					verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
14183					return -EINVAL;
14184				}
14185			} else {
14186				if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
14187				     insn->off != 32) || insn->imm) {
14188					verbose(env, "BPF_MOV uses reserved fields\n");
14189					return -EINVAL;
14190				}
14191			}
14192
14193			/* check src operand */
14194			err = check_reg_arg(env, insn->src_reg, SRC_OP);
14195			if (err)
14196				return err;
14197		} else {
14198			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
14199				verbose(env, "BPF_MOV uses reserved fields\n");
14200				return -EINVAL;
14201			}
14202		}
14203
14204		/* check dest operand, mark as required later */
14205		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
14206		if (err)
14207			return err;
14208
14209		if (BPF_SRC(insn->code) == BPF_X) {
14210			struct bpf_reg_state *src_reg = regs + insn->src_reg;
14211			struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
14212
14213			if (BPF_CLASS(insn->code) == BPF_ALU64) {
14214				if (insn->imm) {
14215					/* off == BPF_ADDR_SPACE_CAST */
14216					mark_reg_unknown(env, regs, insn->dst_reg);
14217					if (insn->imm == 1) { /* cast from as(1) to as(0) */
14218						dst_reg->type = PTR_TO_ARENA;
14219						/* PTR_TO_ARENA is 32-bit */
14220						dst_reg->subreg_def = env->insn_idx + 1;
14221					}
14222				} else if (insn->off == 0) {
14223					/* case: R1 = R2
14224					 * copy register state to dest reg
14225					 */
14226					assign_scalar_id_before_mov(env, src_reg);
14227					copy_register_state(dst_reg, src_reg);
14228					dst_reg->live |= REG_LIVE_WRITTEN;
14229					dst_reg->subreg_def = DEF_NOT_SUBREG;
14230				} else {
14231					/* case: R1 = (s8, s16, s32)R2 */
14232					if (is_pointer_value(env, insn->src_reg)) {
14233						verbose(env,
14234							"R%d sign-extension part of pointer\n",
14235							insn->src_reg);
14236						return -EACCES;
14237					} else if (src_reg->type == SCALAR_VALUE) {
14238						bool no_sext;
14239
14240						no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
14241						if (no_sext)
14242							assign_scalar_id_before_mov(env, src_reg);
14243						copy_register_state(dst_reg, src_reg);
14244						if (!no_sext)
14245							dst_reg->id = 0;
14246						coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
14247						dst_reg->live |= REG_LIVE_WRITTEN;
14248						dst_reg->subreg_def = DEF_NOT_SUBREG;
14249					} else {
14250						mark_reg_unknown(env, regs, insn->dst_reg);
14251					}
14252				}
14253			} else {
14254				/* R1 = (u32) R2 */
14255				if (is_pointer_value(env, insn->src_reg)) {
14256					verbose(env,
14257						"R%d partial copy of pointer\n",
14258						insn->src_reg);
14259					return -EACCES;
14260				} else if (src_reg->type == SCALAR_VALUE) {
14261					if (insn->off == 0) {
14262						bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;
14263
14264						if (is_src_reg_u32)
14265							assign_scalar_id_before_mov(env, src_reg);
14266						copy_register_state(dst_reg, src_reg);
14267						/* Make sure ID is cleared if src_reg is not in u32
14268						 * range otherwise dst_reg min/max could be incorrectly
14269						 * propagated into src_reg by find_equal_scalars()
14270						 */
14271						if (!is_src_reg_u32)
14272							dst_reg->id = 0;
14273						dst_reg->live |= REG_LIVE_WRITTEN;
14274						dst_reg->subreg_def = env->insn_idx + 1;
14275					} else {
14276						/* case: W1 = (s8, s16)W2 */
14277						bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
14278
14279						if (no_sext)
14280							assign_scalar_id_before_mov(env, src_reg);
14281						copy_register_state(dst_reg, src_reg);
14282						if (!no_sext)
14283							dst_reg->id = 0;
14284						dst_reg->live |= REG_LIVE_WRITTEN;
14285						dst_reg->subreg_def = env->insn_idx + 1;
14286						coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
14287					}
14288				} else {
14289					mark_reg_unknown(env, regs,
14290							 insn->dst_reg);
14291				}
14292				zext_32_to_64(dst_reg);
14293				reg_bounds_sync(dst_reg);
14294			}
14295		} else {
14296			/* case: R = imm
14297			 * remember the value we stored into this reg
14298			 */
14299			/* clear any state __mark_reg_known doesn't set */
14300			mark_reg_unknown(env, regs, insn->dst_reg);
14301			regs[insn->dst_reg].type = SCALAR_VALUE;
14302			if (BPF_CLASS(insn->code) == BPF_ALU64) {
14303				__mark_reg_known(regs + insn->dst_reg,
14304						 insn->imm);
14305			} else {
14306				__mark_reg_known(regs + insn->dst_reg,
14307						 (u32)insn->imm);
14308			}
14309		}
14310
14311	} else if (opcode > BPF_END) {
14312		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
14313		return -EINVAL;
14314
14315	} else {	/* all other ALU ops: and, sub, xor, add, ... */
14316
14317		if (BPF_SRC(insn->code) == BPF_X) {
14318			if (insn->imm != 0 || insn->off > 1 ||
14319			    (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
14320				verbose(env, "BPF_ALU uses reserved fields\n");
14321				return -EINVAL;
14322			}
14323			/* check src1 operand */
14324			err = check_reg_arg(env, insn->src_reg, SRC_OP);
14325			if (err)
14326				return err;
14327		} else {
14328			if (insn->src_reg != BPF_REG_0 || insn->off > 1 ||
14329			    (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
14330				verbose(env, "BPF_ALU uses reserved fields\n");
14331				return -EINVAL;
14332			}
14333		}
14334
14335		/* check src2 operand */
14336		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
14337		if (err)
14338			return err;
14339
14340		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
14341		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
14342			verbose(env, "div by zero\n");
14343			return -EINVAL;
14344		}
14345
14346		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
14347		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
14348			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
14349
14350			if (insn->imm < 0 || insn->imm >= size) {
14351				verbose(env, "invalid shift %d\n", insn->imm);
14352				return -EINVAL;
14353			}
14354		}
14355
14356		/* check dest operand */
14357		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
14358		err = err ?: adjust_reg_min_max_vals(env, insn);
14359		if (err)
14360			return err;
14361	}
14362
14363	return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
14364}
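
/* A few of the MOV forms handled above (illustrative, cpu v4 asm syntax):
 *
 *   r1 = r2        ; 64-bit move, full copy of the source register state
 *   w1 = w2        ; 32-bit move, low 32 bits copied and zero-extended
 *   r1 = (s16)r2   ; insn->off == 16, sign-extending move; the shared scalar
 *                  ; id is only kept when r2 is known to fit in 15 bits
 *   w1 = (s8)w2    ; 32-bit sign-extending move of the low byte
 */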
14365
14366static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
14367				   struct bpf_reg_state *dst_reg,
14368				   enum bpf_reg_type type,
14369				   bool range_right_open)
14370{
14371	struct bpf_func_state *state;
14372	struct bpf_reg_state *reg;
14373	int new_range;
14374
14375	if (dst_reg->off < 0 ||
14376	    (dst_reg->off == 0 && range_right_open))
14377		/* This doesn't give us any range */
14378		return;
14379
14380	if (dst_reg->umax_value > MAX_PACKET_OFF ||
14381	    dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
14382		/* Risk of overflow.  For instance, ptr + (1<<63) may be less
14383		 * than pkt_end, but that's because it's also less than pkt.
14384		 */
14385		return;
14386
14387	new_range = dst_reg->off;
14388	if (range_right_open)
14389		new_range++;
14390
14391	/* Examples for register markings:
14392	 *
14393	 * pkt_data in dst register:
14394	 *
14395	 *   r2 = r3;
14396	 *   r2 += 8;
14397	 *   if (r2 > pkt_end) goto <handle exception>
14398	 *   <access okay>
14399	 *
14400	 *   r2 = r3;
14401	 *   r2 += 8;
14402	 *   if (r2 < pkt_end) goto <access okay>
14403	 *   <handle exception>
14404	 *
14405	 *   Where:
14406	 *     r2 == dst_reg, pkt_end == src_reg
14407	 *     r2=pkt(id=n,off=8,r=0)
14408	 *     r3=pkt(id=n,off=0,r=0)
14409	 *
14410	 * pkt_data in src register:
14411	 *
14412	 *   r2 = r3;
14413	 *   r2 += 8;
14414	 *   if (pkt_end >= r2) goto <access okay>
14415	 *   <handle exception>
14416	 *
14417	 *   r2 = r3;
14418	 *   r2 += 8;
14419	 *   if (pkt_end <= r2) goto <handle exception>
14420	 *   <access okay>
14421	 *
14422	 *   Where:
14423	 *     pkt_end == dst_reg, r2 == src_reg
14424	 *     r2=pkt(id=n,off=8,r=0)
14425	 *     r3=pkt(id=n,off=0,r=0)
14426	 *
14427	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
14428	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
14429	 * and [r3, r3 + 8-1) respectively is safe to access depending on
14430	 * the check.
14431	 */
14432
14433	/* If our ids match, then we must have the same max_value.  And we
14434	 * don't care about the other reg's fixed offset, since if it's too big
14435	 * the range won't allow anything.
14436	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
14437	 */
14438	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
14439		if (reg->type == type && reg->id == dst_reg->id)
14440			/* keep the maximum range already checked */
14441			reg->range = max(reg->range, new_range);
14442	}));
14443}
14444
14445/*
14446 * <reg1> <op> <reg2>, currently assuming reg2 is a constant
14447 */
14448static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
14449				  u8 opcode, bool is_jmp32)
14450{
14451	struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
14452	struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
14453	u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
14454	u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
14455	s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
14456	s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
14457	u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
14458	u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
14459	s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
14460	s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
14461
14462	switch (opcode) {
14463	case BPF_JEQ:
14464		/* constants, umin/umax and smin/smax checks would be
14465		 * redundant in this case because they all should match
14466		 */
14467		if (tnum_is_const(t1) && tnum_is_const(t2))
14468			return t1.value == t2.value;
14469		/* non-overlapping ranges */
14470		if (umin1 > umax2 || umax1 < umin2)
14471			return 0;
14472		if (smin1 > smax2 || smax1 < smin2)
14473			return 0;
14474		if (!is_jmp32) {
14475			/* if 64-bit ranges are inconclusive, see if we can
14476			 * utilize 32-bit subrange knowledge to eliminate
14477			 * branches that can't be taken a priori
14478			 */
14479			if (reg1->u32_min_value > reg2->u32_max_value ||
14480			    reg1->u32_max_value < reg2->u32_min_value)
14481				return 0;
14482			if (reg1->s32_min_value > reg2->s32_max_value ||
14483			    reg1->s32_max_value < reg2->s32_min_value)
14484				return 0;
14485		}
14486		break;
14487	case BPF_JNE:
14488		/* constants, umin/umax and smin/smax checks would be
14489		 * redundant in this case because they all should match
14490		 */
14491		if (tnum_is_const(t1) && tnum_is_const(t2))
14492			return t1.value != t2.value;
14493		/* non-overlapping ranges */
14494		if (umin1 > umax2 || umax1 < umin2)
14495			return 1;
14496		if (smin1 > smax2 || smax1 < smin2)
14497			return 1;
14498		if (!is_jmp32) {
14499			/* if 64-bit ranges are inconclusive, see if we can
14500			 * utilize 32-bit subrange knowledge to eliminate
14501			 * branches that can't be taken a priori
14502			 */
14503			if (reg1->u32_min_value > reg2->u32_max_value ||
14504			    reg1->u32_max_value < reg2->u32_min_value)
14505				return 1;
14506			if (reg1->s32_min_value > reg2->s32_max_value ||
14507			    reg1->s32_max_value < reg2->s32_min_value)
14508				return 1;
14509		}
14510		break;
14511	case BPF_JSET:
14512		if (!is_reg_const(reg2, is_jmp32)) {
14513			swap(reg1, reg2);
14514			swap(t1, t2);
14515		}
14516		if (!is_reg_const(reg2, is_jmp32))
14517			return -1;
14518		if ((~t1.mask & t1.value) & t2.value)
14519			return 1;
14520		if (!((t1.mask | t1.value) & t2.value))
14521			return 0;
14522		break;
14523	case BPF_JGT:
14524		if (umin1 > umax2)
14525			return 1;
14526		else if (umax1 <= umin2)
14527			return 0;
14528		break;
14529	case BPF_JSGT:
14530		if (smin1 > smax2)
14531			return 1;
14532		else if (smax1 <= smin2)
14533			return 0;
14534		break;
14535	case BPF_JLT:
14536		if (umax1 < umin2)
14537			return 1;
14538		else if (umin1 >= umax2)
14539			return 0;
14540		break;
14541	case BPF_JSLT:
14542		if (smax1 < smin2)
14543			return 1;
14544		else if (smin1 >= smax2)
14545			return 0;
14546		break;
14547	case BPF_JGE:
14548		if (umin1 >= umax2)
14549			return 1;
14550		else if (umax1 < umin2)
14551			return 0;
14552		break;
14553	case BPF_JSGE:
14554		if (smin1 >= smax2)
14555			return 1;
14556		else if (smax1 < smin2)
14557			return 0;
14558		break;
14559	case BPF_JLE:
14560		if (umax1 <= umin2)
14561			return 1;
14562		else if (umin1 > umax2)
14563			return 0;
14564		break;
14565	case BPF_JSLE:
14566		if (smax1 <= smin2)
14567			return 1;
14568		else if (smin1 > smax2)
14569			return 0;
14570		break;
14571	}
14572
14573	return -1;
14574}
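
/* For example, with r1 known to be in [0, 10] compared against the constant 5:
 *
 *   if r1 < 5 goto ...
 *
 * returns -1 since both outcomes are possible, whereas with r1 in [0, 3] it
 * returns 1 (always taken) and with r1 in [7, 10] it returns 0 (never taken),
 * allowing the verifier to prune the respective dead branch.
 */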
14575
14576static int flip_opcode(u32 opcode)
14577{
14578	/* How can we transform "a <op> b" into "b <op> a"? */
14579	static const u8 opcode_flip[16] = {
14580		/* these stay the same */
14581		[BPF_JEQ  >> 4] = BPF_JEQ,
14582		[BPF_JNE  >> 4] = BPF_JNE,
14583		[BPF_JSET >> 4] = BPF_JSET,
14584		/* these swap "lesser" and "greater" (L and G in the opcodes) */
14585		[BPF_JGE  >> 4] = BPF_JLE,
14586		[BPF_JGT  >> 4] = BPF_JLT,
14587		[BPF_JLE  >> 4] = BPF_JGE,
14588		[BPF_JLT  >> 4] = BPF_JGT,
14589		[BPF_JSGE >> 4] = BPF_JSLE,
14590		[BPF_JSGT >> 4] = BPF_JSLT,
14591		[BPF_JSLE >> 4] = BPF_JSGE,
14592		[BPF_JSLT >> 4] = BPF_JSGT
14593	};
14594	return opcode_flip[opcode >> 4];
14595}
14596
14597static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
14598				   struct bpf_reg_state *src_reg,
14599				   u8 opcode)
14600{
14601	struct bpf_reg_state *pkt;
14602
14603	if (src_reg->type == PTR_TO_PACKET_END) {
14604		pkt = dst_reg;
14605	} else if (dst_reg->type == PTR_TO_PACKET_END) {
14606		pkt = src_reg;
14607		opcode = flip_opcode(opcode);
14608	} else {
14609		return -1;
14610	}
14611
14612	if (pkt->range >= 0)
14613		return -1;
14614
14615	switch (opcode) {
14616	case BPF_JLE:
14617		/* pkt <= pkt_end */
14618		fallthrough;
14619	case BPF_JGT:
14620		/* pkt > pkt_end */
14621		if (pkt->range == BEYOND_PKT_END)
14622			/* pkt has at least one extra byte beyond pkt_end */
14623			return opcode == BPF_JGT;
14624		break;
14625	case BPF_JLT:
14626		/* pkt < pkt_end */
14627		fallthrough;
14628	case BPF_JGE:
14629		/* pkt >= pkt_end */
14630		if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
14631			return opcode == BPF_JGE;
14632		break;
14633	}
14634	return -1;
14635}
14636
14637/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
14638 * and return:
14639 *  1 - branch will be taken and "goto target" will be executed
14640 *  0 - branch will not be taken and fall-through to next insn
 * -1 - unknown. Example: "if (reg1 < 5)" is unknown when reg1's value
 *      range is [0,10]
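 *      By contrast, "if (reg1 > 20)" is known: 1 when reg1's range is
 *      [30,40] and 0 when it is [0,15].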
14643 */
14644static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
14645			   u8 opcode, bool is_jmp32)
14646{
14647	if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
14648		return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
14649
14650	if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
14651		u64 val;
14652
14653		/* arrange that reg2 is a scalar, and reg1 is a pointer */
14654		if (!is_reg_const(reg2, is_jmp32)) {
14655			opcode = flip_opcode(opcode);
14656			swap(reg1, reg2);
14657		}
14658		/* and ensure that reg2 is a constant */
14659		if (!is_reg_const(reg2, is_jmp32))
14660			return -1;
14661
14662		if (!reg_not_null(reg1))
14663			return -1;
14664
		/* If the pointer is valid, tests against zero will fail, so we
		 * can use this to determine the branch taken.
14667		 */
14668		val = reg_const_value(reg2, is_jmp32);
14669		if (val != 0)
14670			return -1;
14671
14672		switch (opcode) {
14673		case BPF_JEQ:
14674			return 0;
14675		case BPF_JNE:
14676			return 1;
14677		default:
14678			return -1;
14679		}
14680	}
14681
14682	/* now deal with two scalars, but not necessarily constants */
14683	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
14684}
14685
14686/* Opcode that corresponds to a *false* branch condition.
14687 * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
14688 */
14689static u8 rev_opcode(u8 opcode)
14690{
14691	switch (opcode) {
14692	case BPF_JEQ:		return BPF_JNE;
14693	case BPF_JNE:		return BPF_JEQ;
	/* JSET doesn't have its reverse opcode in BPF, so add
14695	 * BPF_X flag to denote the reverse of that operation
14696	 */
14697	case BPF_JSET:		return BPF_JSET | BPF_X;
14698	case BPF_JSET | BPF_X:	return BPF_JSET;
14699	case BPF_JGE:		return BPF_JLT;
14700	case BPF_JGT:		return BPF_JLE;
14701	case BPF_JLE:		return BPF_JGT;
14702	case BPF_JLT:		return BPF_JGE;
14703	case BPF_JSGE:		return BPF_JSLT;
14704	case BPF_JSGT:		return BPF_JSLE;
14705	case BPF_JSLE:		return BPF_JSGT;
14706	case BPF_JSLT:		return BPF_JSGE;
14707	default:		return 0;
14708	}
14709}
14710
/* Refine range knowledge for <reg1> <op> <reg2> conditional operation. */
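/* For example, refining the TRUE branch of "r1 < r2" (BPF_JLT, 64-bit) with
 * r1 in [5,100] and r2 in [0,50] yields r1 in [5,49]
 * (umax1 = min(100, 50 - 1)) and r2 in [6,50] (umin2 = max(5 + 1, 0)).
 */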
14712static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
14713				u8 opcode, bool is_jmp32)
14714{
14715	struct tnum t;
14716	u64 val;
14717
	/* In case of GE/GT/SGE/SGT, reuse LE/LT/SLE/SLT logic from below */
14719	switch (opcode) {
14720	case BPF_JGE:
14721	case BPF_JGT:
14722	case BPF_JSGE:
14723	case BPF_JSGT:
14724		opcode = flip_opcode(opcode);
14725		swap(reg1, reg2);
14726		break;
14727	default:
14728		break;
14729	}
14730
14731	switch (opcode) {
14732	case BPF_JEQ:
14733		if (is_jmp32) {
14734			reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
14735			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
14736			reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
14737			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
14738			reg2->u32_min_value = reg1->u32_min_value;
14739			reg2->u32_max_value = reg1->u32_max_value;
14740			reg2->s32_min_value = reg1->s32_min_value;
14741			reg2->s32_max_value = reg1->s32_max_value;
14742
14743			t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
14744			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
14745			reg2->var_off = tnum_with_subreg(reg2->var_off, t);
14746		} else {
14747			reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
14748			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
14749			reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
14750			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
14751			reg2->umin_value = reg1->umin_value;
14752			reg2->umax_value = reg1->umax_value;
14753			reg2->smin_value = reg1->smin_value;
14754			reg2->smax_value = reg1->smax_value;
14755
14756			reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
14757			reg2->var_off = reg1->var_off;
14758		}
14759		break;
14760	case BPF_JNE:
14761		if (!is_reg_const(reg2, is_jmp32))
14762			swap(reg1, reg2);
14763		if (!is_reg_const(reg2, is_jmp32))
14764			break;
14765
		/* try to recompute the bounds of reg1 if reg2 is a const and
		 * sits exactly at the edge of reg1's range.
14768		 */
14769		val = reg_const_value(reg2, is_jmp32);
14770		if (is_jmp32) {
14771			/* u32_min_value is not equal to 0xffffffff at this point,
14772			 * because otherwise u32_max_value is 0xffffffff as well,
			 * in which case both reg1 and reg2 would be constants,
			 * the jump would be predicted and reg_set_min_max()
			 * wouldn't be called.
14776			 *
14777			 * Same reasoning works for all {u,s}{min,max}{32,64} cases
14778			 * below.
14779			 */
14780			if (reg1->u32_min_value == (u32)val)
14781				reg1->u32_min_value++;
14782			if (reg1->u32_max_value == (u32)val)
14783				reg1->u32_max_value--;
14784			if (reg1->s32_min_value == (s32)val)
14785				reg1->s32_min_value++;
14786			if (reg1->s32_max_value == (s32)val)
14787				reg1->s32_max_value--;
14788		} else {
14789			if (reg1->umin_value == (u64)val)
14790				reg1->umin_value++;
14791			if (reg1->umax_value == (u64)val)
14792				reg1->umax_value--;
14793			if (reg1->smin_value == (s64)val)
14794				reg1->smin_value++;
14795			if (reg1->smax_value == (s64)val)
14796				reg1->smax_value--;
14797		}
14798		break;
14799	case BPF_JSET:
14800		if (!is_reg_const(reg2, is_jmp32))
14801			swap(reg1, reg2);
14802		if (!is_reg_const(reg2, is_jmp32))
14803			break;
14804		val = reg_const_value(reg2, is_jmp32);
14805		/* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
		 * requires a single-bit constant to learn something useful. E.g., if we
14807		 * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
14808		 * are actually set? We can learn something definite only if
14809		 * it's a single-bit value to begin with.
14810		 *
14811		 * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
14812		 * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
14813		 * bit 1 is set, which we can readily use in adjustments.
14814		 */
14815		if (!is_power_of_2(val))
14816			break;
14817		if (is_jmp32) {
14818			t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
14819			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
14820		} else {
14821			reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
14822		}
14823		break;
14824	case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
14825		if (!is_reg_const(reg2, is_jmp32))
14826			swap(reg1, reg2);
14827		if (!is_reg_const(reg2, is_jmp32))
14828			break;
14829		val = reg_const_value(reg2, is_jmp32);
14830		if (is_jmp32) {
14831			t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
14832			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
14833		} else {
14834			reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
14835		}
14836		break;
14837	case BPF_JLE:
14838		if (is_jmp32) {
14839			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
14840			reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
14841		} else {
14842			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
14843			reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
14844		}
14845		break;
14846	case BPF_JLT:
14847		if (is_jmp32) {
14848			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
14849			reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
14850		} else {
14851			reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
14852			reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
14853		}
14854		break;
14855	case BPF_JSLE:
14856		if (is_jmp32) {
14857			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
14858			reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
14859		} else {
14860			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
14861			reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
14862		}
14863		break;
14864	case BPF_JSLT:
14865		if (is_jmp32) {
14866			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
14867			reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
14868		} else {
14869			reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
14870			reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
14871		}
14872		break;
14873	default:
14874		return;
14875	}
14876}
14877
14878/* Adjusts the register min/max values in the case that the dst_reg and
14879 * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
14880 * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
14881 * Technically we can do similar adjustments for pointers to the same object,
14882 * but we don't support that right now.
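 *
 * The (true_reg1, true_reg2) pair holds the register copies for the branch
 * where the condition is true (the jump is taken), and (false_reg1,
 * false_reg2) the copies for the fall-through branch; each pair is refined
 * independently below.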
14883 */
14884static int reg_set_min_max(struct bpf_verifier_env *env,
14885			   struct bpf_reg_state *true_reg1,
14886			   struct bpf_reg_state *true_reg2,
14887			   struct bpf_reg_state *false_reg1,
14888			   struct bpf_reg_state *false_reg2,
14889			   u8 opcode, bool is_jmp32)
14890{
14891	int err;
14892
14893	/* If either register is a pointer, we can't learn anything about its
	 * variable offset from the compare (unless they were pointers into
14895	 * the same object, but we don't bother with that).
14896	 */
14897	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
14898		return 0;
14899
14900	/* fallthrough (FALSE) branch */
14901	regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
14902	reg_bounds_sync(false_reg1);
14903	reg_bounds_sync(false_reg2);
14904
14905	/* jump (TRUE) branch */
14906	regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
14907	reg_bounds_sync(true_reg1);
14908	reg_bounds_sync(true_reg2);
14909
14910	err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
14911	err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
14912	err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
14913	err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
14914	return err;
14915}
14916
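/* Convert a single maybe-NULL pointer register whose id matches: in the
 * "== NULL" branch it becomes a plain SCALAR_VALUE, in the "!= NULL" branch
 * the PTR_MAYBE_NULL flag is cleared via mark_ptr_not_null_reg().
 */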
14917static void mark_ptr_or_null_reg(struct bpf_func_state *state,
14918				 struct bpf_reg_state *reg, u32 id,
14919				 bool is_null)
14920{
14921	if (type_may_be_null(reg->type) && reg->id == id &&
14922	    (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
14923		/* Old offset (both fixed and variable parts) should have been
14924		 * known-zero, because we don't allow pointer arithmetic on
14925		 * pointers that might be NULL. If we see this happening, don't
14926		 * convert the register.
14927		 *
		 * But some helpers that return local kptrs advance the offset
		 * of the returned pointer. In those cases, it is fine to see a
		 * non-zero reg->off.
14931		 */
14932		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
14933			return;
14934		if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
14935		    WARN_ON_ONCE(reg->off))
14936			return;
14937
14938		if (is_null) {
14939			reg->type = SCALAR_VALUE;
			/* We don't need id and ref_obj_id from this point
			 * onwards, so reset them to give state pruning a
			 * chance to take effect.
14943			 */
14944			reg->id = 0;
14945			reg->ref_obj_id = 0;
14946
14947			return;
14948		}
14949
14950		mark_ptr_not_null_reg(reg);
14951
14952		if (!reg_may_point_to_spin_lock(reg)) {
14953			/* For not-NULL ptr, reg->ref_obj_id will be reset
14954			 * in release_reference().
14955			 *
14956			 * reg->id is still used by spin_lock ptr. Other
14957			 * than spin_lock ptr type, reg->id can be reset.
14958			 */
14959			reg->id = 0;
14960		}
14961	}
14962}
14963
14964/* The logic is similar to find_good_pkt_pointers(), both could eventually
14965 * be folded together at some point.
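 *
 * E.g. (illustrative) after "r2 = bpf_map_lookup_elem(...); if (r2 == NULL)",
 * every register sharing r2's id becomes a plain SCALAR_VALUE in the
 * "== NULL" branch and a non-NULL PTR_TO_MAP_VALUE in the other branch.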
14966 */
14967static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
14968				  bool is_null)
14969{
14970	struct bpf_func_state *state = vstate->frame[vstate->curframe];
14971	struct bpf_reg_state *regs = state->regs, *reg;
14972	u32 ref_obj_id = regs[regno].ref_obj_id;
14973	u32 id = regs[regno].id;
14974
14975	if (ref_obj_id && ref_obj_id == id && is_null)
14976		/* regs[regno] is in the " == NULL" branch.
14977		 * No one could have freed the reference state before
14978		 * doing the NULL check.
14979		 */
14980		WARN_ON_ONCE(release_reference_state(state, id));
14981
14982	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
14983		mark_ptr_or_null_reg(state, reg, id, is_null);
14984	}));
14985}
14986
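/* Recognize the canonical packet bounds check idiom, e.g. (illustrative):
 *
 *   void *data = (void *)(long)skb->data;
 *   void *data_end = (void *)(long)skb->data_end;
 *   if (data + 8 > data_end)
 *       return 0;
 *   ... accesses to data[0..7] are now known to be in bounds ...
 *
 * and record the proven range on matching packet pointers in the branch
 * where the access is safe (via find_good_pkt_pointers()), while marking
 * the compared pointer as at/beyond pkt_end in the other branch (via
 * mark_pkt_end()).
 */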
14987static bool try_match_pkt_pointers(const struct bpf_insn *insn,
14988				   struct bpf_reg_state *dst_reg,
14989				   struct bpf_reg_state *src_reg,
14990				   struct bpf_verifier_state *this_branch,
14991				   struct bpf_verifier_state *other_branch)
14992{
14993	if (BPF_SRC(insn->code) != BPF_X)
14994		return false;
14995
14996	/* Pointers are always 64-bit. */
14997	if (BPF_CLASS(insn->code) == BPF_JMP32)
14998		return false;
14999
15000	switch (BPF_OP(insn->code)) {
15001	case BPF_JGT:
15002		if ((dst_reg->type == PTR_TO_PACKET &&
15003		     src_reg->type == PTR_TO_PACKET_END) ||
15004		    (dst_reg->type == PTR_TO_PACKET_META &&
15005		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
15006			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
15007			find_good_pkt_pointers(this_branch, dst_reg,
15008					       dst_reg->type, false);
15009			mark_pkt_end(other_branch, insn->dst_reg, true);
15010		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
15011			    src_reg->type == PTR_TO_PACKET) ||
15012			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
15013			    src_reg->type == PTR_TO_PACKET_META)) {
15014			/* pkt_end > pkt_data', pkt_data > pkt_meta' */
15015			find_good_pkt_pointers(other_branch, src_reg,
15016					       src_reg->type, true);
15017			mark_pkt_end(this_branch, insn->src_reg, false);
15018		} else {
15019			return false;
15020		}
15021		break;
15022	case BPF_JLT:
15023		if ((dst_reg->type == PTR_TO_PACKET &&
15024		     src_reg->type == PTR_TO_PACKET_END) ||
15025		    (dst_reg->type == PTR_TO_PACKET_META &&
15026		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
15027			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
15028			find_good_pkt_pointers(other_branch, dst_reg,
15029					       dst_reg->type, true);
15030			mark_pkt_end(this_branch, insn->dst_reg, false);
15031		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
15032			    src_reg->type == PTR_TO_PACKET) ||
15033			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
15034			    src_reg->type == PTR_TO_PACKET_META)) {
			/* pkt_end < pkt_data', pkt_data < pkt_meta' */
15036			find_good_pkt_pointers(this_branch, src_reg,
15037					       src_reg->type, false);
15038			mark_pkt_end(other_branch, insn->src_reg, true);
15039		} else {
15040			return false;
15041		}
15042		break;
15043	case BPF_JGE:
15044		if ((dst_reg->type == PTR_TO_PACKET &&
15045		     src_reg->type == PTR_TO_PACKET_END) ||
15046		    (dst_reg->type == PTR_TO_PACKET_META &&
15047		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
15048			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
15049			find_good_pkt_pointers(this_branch, dst_reg,
15050					       dst_reg->type, true);
15051			mark_pkt_end(other_branch, insn->dst_reg, false);
15052		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
15053			    src_reg->type == PTR_TO_PACKET) ||
15054			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
15055			    src_reg->type == PTR_TO_PACKET_META)) {
15056			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
15057			find_good_pkt_pointers(other_branch, src_reg,
15058					       src_reg->type, false);
15059			mark_pkt_end(this_branch, insn->src_reg, true);
15060		} else {
15061			return false;
15062		}
15063		break;
15064	case BPF_JLE:
15065		if ((dst_reg->type == PTR_TO_PACKET &&
15066		     src_reg->type == PTR_TO_PACKET_END) ||
15067		    (dst_reg->type == PTR_TO_PACKET_META &&
15068		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
15069			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
15070			find_good_pkt_pointers(other_branch, dst_reg,
15071					       dst_reg->type, false);
15072			mark_pkt_end(this_branch, insn->dst_reg, true);
15073		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
15074			    src_reg->type == PTR_TO_PACKET) ||
15075			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
15076			    src_reg->type == PTR_TO_PACKET_META)) {
15077			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
15078			find_good_pkt_pointers(this_branch, src_reg,
15079					       src_reg->type, true);
15080			mark_pkt_end(other_branch, insn->src_reg, false);
15081		} else {
15082			return false;
15083		}
15084		break;
15085	default:
15086		return false;
15087	}
15088
15089	return true;
15090}
15091
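/* Propagate the newly refined bounds of a known scalar register to every
 * other register (and spilled stack slot) that shares the same non-zero id,
 * i.e. is known to hold the same value. E.g. after "r2 = r1" both registers
 * carry the same id, so a later "if r1 > 10" refines r2 as well.
 */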
15092static void find_equal_scalars(struct bpf_verifier_state *vstate,
15093			       struct bpf_reg_state *known_reg)
15094{
15095	struct bpf_func_state *state;
15096	struct bpf_reg_state *reg;
15097
15098	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
15099		if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
15100			copy_register_state(reg, known_reg);
15101	}));
15102}
15103
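/* Verify a conditional jump: predict the branch via is_branch_taken() when
 * the outcome is known, otherwise push the jump-target state onto the
 * verification stack and refine register ranges (and pointer nullness)
 * separately in the fall-through and taken branches.
 */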
15104static int check_cond_jmp_op(struct bpf_verifier_env *env,
15105			     struct bpf_insn *insn, int *insn_idx)
15106{
15107	struct bpf_verifier_state *this_branch = env->cur_state;
15108	struct bpf_verifier_state *other_branch;
15109	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
15110	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
15111	struct bpf_reg_state *eq_branch_regs;
15112	struct bpf_reg_state fake_reg = {};
15113	u8 opcode = BPF_OP(insn->code);
15114	bool is_jmp32;
15115	int pred = -1;
15116	int err;
15117
15118	/* Only conditional jumps are expected to reach here. */
15119	if (opcode == BPF_JA || opcode > BPF_JCOND) {
15120		verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
15121		return -EINVAL;
15122	}
15123
15124	if (opcode == BPF_JCOND) {
15125		struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
15126		int idx = *insn_idx;
15127
15128		if (insn->code != (BPF_JMP | BPF_JCOND) ||
15129		    insn->src_reg != BPF_MAY_GOTO ||
15130		    insn->dst_reg || insn->imm || insn->off == 0) {
15131			verbose(env, "invalid may_goto off %d imm %d\n",
15132				insn->off, insn->imm);
15133			return -EINVAL;
15134		}
15135		prev_st = find_prev_entry(env, cur_st->parent, idx);
15136
15137		/* branch out 'fallthrough' insn as a new state to explore */
15138		queued_st = push_stack(env, idx + 1, idx, false);
15139		if (!queued_st)
15140			return -ENOMEM;
15141
15142		queued_st->may_goto_depth++;
15143		if (prev_st)
15144			widen_imprecise_scalars(env, prev_st, queued_st);
15145		*insn_idx += insn->off;
15146		return 0;
15147	}
15148
15149	/* check src2 operand */
15150	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
15151	if (err)
15152		return err;
15153
15154	dst_reg = &regs[insn->dst_reg];
15155	if (BPF_SRC(insn->code) == BPF_X) {
15156		if (insn->imm != 0) {
15157			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
15158			return -EINVAL;
15159		}
15160
15161		/* check src1 operand */
15162		err = check_reg_arg(env, insn->src_reg, SRC_OP);
15163		if (err)
15164			return err;
15165
15166		src_reg = &regs[insn->src_reg];
15167		if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
15168		    is_pointer_value(env, insn->src_reg)) {
15169			verbose(env, "R%d pointer comparison prohibited\n",
15170				insn->src_reg);
15171			return -EACCES;
15172		}
15173	} else {
15174		if (insn->src_reg != BPF_REG_0) {
15175			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
15176			return -EINVAL;
15177		}
15178		src_reg = &fake_reg;
15179		src_reg->type = SCALAR_VALUE;
15180		__mark_reg_known(src_reg, insn->imm);
15181	}
15182
15183	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
15184	pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
15185	if (pred >= 0) {
15186		/* If we get here with a dst_reg pointer type it is because
		 * is_branch_taken() above special-cased the comparison with 0.
15188		 */
15189		if (!__is_pointer_value(false, dst_reg))
15190			err = mark_chain_precision(env, insn->dst_reg);
15191		if (BPF_SRC(insn->code) == BPF_X && !err &&
15192		    !__is_pointer_value(false, src_reg))
15193			err = mark_chain_precision(env, insn->src_reg);
15194		if (err)
15195			return err;
15196	}
15197
15198	if (pred == 1) {
15199		/* Only follow the goto, ignore fall-through. If needed, push
15200		 * the fall-through branch for simulation under speculative
15201		 * execution.
15202		 */
15203		if (!env->bypass_spec_v1 &&
15204		    !sanitize_speculative_path(env, insn, *insn_idx + 1,
15205					       *insn_idx))
15206			return -EFAULT;
15207		if (env->log.level & BPF_LOG_LEVEL)
15208			print_insn_state(env, this_branch->frame[this_branch->curframe]);
15209		*insn_idx += insn->off;
15210		return 0;
15211	} else if (pred == 0) {
15212		/* Only follow the fall-through branch, since that's where the
15213		 * program will go. If needed, push the goto branch for
15214		 * simulation under speculative execution.
15215		 */
15216		if (!env->bypass_spec_v1 &&
15217		    !sanitize_speculative_path(env, insn,
15218					       *insn_idx + insn->off + 1,
15219					       *insn_idx))
15220			return -EFAULT;
15221		if (env->log.level & BPF_LOG_LEVEL)
15222			print_insn_state(env, this_branch->frame[this_branch->curframe]);
15223		return 0;
15224	}
15225
15226	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
15227				  false);
15228	if (!other_branch)
15229		return -EFAULT;
15230	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
15231
15232	if (BPF_SRC(insn->code) == BPF_X) {
15233		err = reg_set_min_max(env,
15234				      &other_branch_regs[insn->dst_reg],
15235				      &other_branch_regs[insn->src_reg],
15236				      dst_reg, src_reg, opcode, is_jmp32);
15237	} else /* BPF_SRC(insn->code) == BPF_K */ {
15238		err = reg_set_min_max(env,
15239				      &other_branch_regs[insn->dst_reg],
15240				      src_reg /* fake one */,
15241				      dst_reg, src_reg /* same fake one */,
15242				      opcode, is_jmp32);
15243	}
15244	if (err)
15245		return err;
15246
15247	if (BPF_SRC(insn->code) == BPF_X &&
15248	    src_reg->type == SCALAR_VALUE && src_reg->id &&
15249	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
15250		find_equal_scalars(this_branch, src_reg);
15251		find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
15252	}
15253	if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
15254	    !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
15255		find_equal_scalars(this_branch, dst_reg);
15256		find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
15257	}
15258
15259	/* if one pointer register is compared to another pointer
	 * register, check if PTR_MAYBE_NULL could be lifted.
15261	 * E.g. register A - maybe null
15262	 *      register B - not null
15263	 * for JNE A, B, ... - A is not null in the false branch;
15264	 * for JEQ A, B, ... - A is not null in the true branch.
15265	 *
15266	 * Since PTR_TO_BTF_ID points to a kernel struct that does
15267	 * not need to be null checked by the BPF program, i.e.,
	 * it could be null even without the PTR_MAYBE_NULL marking,
	 * only propagate nullness when neither reg is of that type.
15270	 */
15271	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
15272	    __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
15273	    type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
15274	    base_type(src_reg->type) != PTR_TO_BTF_ID &&
15275	    base_type(dst_reg->type) != PTR_TO_BTF_ID) {
15276		eq_branch_regs = NULL;
15277		switch (opcode) {
15278		case BPF_JEQ:
15279			eq_branch_regs = other_branch_regs;
15280			break;
15281		case BPF_JNE:
15282			eq_branch_regs = regs;
15283			break;
15284		default:
15285			/* do nothing */
15286			break;
15287		}
15288		if (eq_branch_regs) {
15289			if (type_may_be_null(src_reg->type))
15290				mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
15291			else
15292				mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
15293		}
15294	}
15295
15296	/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
	 * NOTE: the optimizations below are related to pointer comparisons,
	 *       which will never be JMP32.
15299	 */
15300	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
15301	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
15302	    type_may_be_null(dst_reg->type)) {
15303		/* Mark all identical registers in each branch as either
		 * safe or unknown depending on the R == 0 or R != 0 condition.
15305		 */
15306		mark_ptr_or_null_regs(this_branch, insn->dst_reg,
15307				      opcode == BPF_JNE);
15308		mark_ptr_or_null_regs(other_branch, insn->dst_reg,
15309				      opcode == BPF_JEQ);
15310	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
15311					   this_branch, other_branch) &&
15312		   is_pointer_value(env, insn->dst_reg)) {
15313		verbose(env, "R%d pointer comparison prohibited\n",
15314			insn->dst_reg);
15315		return -EACCES;
15316	}
15317	if (env->log.level & BPF_LOG_LEVEL)
15318		print_insn_state(env, this_branch->frame[this_branch->curframe]);
15319	return 0;
15320}
15321
15322/* verify BPF_LD_IMM64 instruction */
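/* A BPF_LD_IMM64 instruction occupies two slots: the low 32 bits of the
 * immediate are in insn->imm and the high 32 bits in (insn + 1)->imm.
 * insn->src_reg selects the pseudo-load flavor (plain constant, map fd/idx,
 * map value, BTF variable, or subprog function pointer).
 */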
15323static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
15324{
15325	struct bpf_insn_aux_data *aux = cur_aux(env);
15326	struct bpf_reg_state *regs = cur_regs(env);
15327	struct bpf_reg_state *dst_reg;
15328	struct bpf_map *map;
15329	int err;
15330
15331	if (BPF_SIZE(insn->code) != BPF_DW) {
15332		verbose(env, "invalid BPF_LD_IMM insn\n");
15333		return -EINVAL;
15334	}
15335	if (insn->off != 0) {
15336		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
15337		return -EINVAL;
15338	}
15339
15340	err = check_reg_arg(env, insn->dst_reg, DST_OP);
15341	if (err)
15342		return err;
15343
15344	dst_reg = &regs[insn->dst_reg];
15345	if (insn->src_reg == 0) {
15346		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
15347
15348		dst_reg->type = SCALAR_VALUE;
15349		__mark_reg_known(&regs[insn->dst_reg], imm);
15350		return 0;
15351	}
15352
15353	/* All special src_reg cases are listed below. From this point onwards
15354	 * we either succeed and assign a corresponding dst_reg->type after
15355	 * zeroing the offset, or fail and reject the program.
15356	 */
15357	mark_reg_known_zero(env, regs, insn->dst_reg);
15358
15359	if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
15360		dst_reg->type = aux->btf_var.reg_type;
15361		switch (base_type(dst_reg->type)) {
15362		case PTR_TO_MEM:
15363			dst_reg->mem_size = aux->btf_var.mem_size;
15364			break;
15365		case PTR_TO_BTF_ID:
15366			dst_reg->btf = aux->btf_var.btf;
15367			dst_reg->btf_id = aux->btf_var.btf_id;
15368			break;
15369		default:
15370			verbose(env, "bpf verifier is misconfigured\n");
15371			return -EFAULT;
15372		}
15373		return 0;
15374	}
15375
15376	if (insn->src_reg == BPF_PSEUDO_FUNC) {
15377		struct bpf_prog_aux *aux = env->prog->aux;
15378		u32 subprogno = find_subprog(env,
15379					     env->insn_idx + insn->imm + 1);
15380
15381		if (!aux->func_info) {
15382			verbose(env, "missing btf func_info\n");
15383			return -EINVAL;
15384		}
15385		if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
15386			verbose(env, "callback function not static\n");
15387			return -EINVAL;
15388		}
15389
15390		dst_reg->type = PTR_TO_FUNC;
15391		dst_reg->subprogno = subprogno;
15392		return 0;
15393	}
15394
15395	map = env->used_maps[aux->map_index];
15396	dst_reg->map_ptr = map;
15397
15398	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
15399	    insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
15400		if (map->map_type == BPF_MAP_TYPE_ARENA) {
15401			__mark_reg_unknown(env, dst_reg);
15402			return 0;
15403		}
15404		dst_reg->type = PTR_TO_MAP_VALUE;
15405		dst_reg->off = aux->map_off;
15406		WARN_ON_ONCE(map->max_entries != 1);
		/* We want reg->id to remain the same (0), as the map_value is not distinct */
15408	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
15409		   insn->src_reg == BPF_PSEUDO_MAP_IDX) {
15410		dst_reg->type = CONST_PTR_TO_MAP;
15411	} else {
15412		verbose(env, "bpf verifier is misconfigured\n");
15413		return -EINVAL;
15414	}
15415
15416	return 0;
15417}
15418
15419static bool may_access_skb(enum bpf_prog_type type)
15420{
15421	switch (type) {
15422	case BPF_PROG_TYPE_SOCKET_FILTER:
15423	case BPF_PROG_TYPE_SCHED_CLS:
15424	case BPF_PROG_TYPE_SCHED_ACT:
15425		return true;
15426	default:
15427		return false;
15428	}
15429}
15430
15431/* verify safety of LD_ABS|LD_IND instructions:
 * - they can only appear in programs where ctx == skb
 * - since they are wrappers of function calls, they scratch R1-R5 registers,
 *   preserve R6-R9, and store the return value into R0
15435 *
15436 * Implicit input:
15437 *   ctx == skb == R6 == CTX
15438 *
15439 * Explicit input:
15440 *   SRC == any register
15441 *   IMM == 32-bit immediate
15442 *
15443 * Output:
15444 *   R0 - 8/16/32-bit skb data converted to cpu endianness
15445 */
15446static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
15447{
15448	struct bpf_reg_state *regs = cur_regs(env);
15449	static const int ctx_reg = BPF_REG_6;
15450	u8 mode = BPF_MODE(insn->code);
15451	int i, err;
15452
15453	if (!may_access_skb(resolve_prog_type(env->prog))) {
15454		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
15455		return -EINVAL;
15456	}
15457
15458	if (!env->ops->gen_ld_abs) {
15459		verbose(env, "bpf verifier is misconfigured\n");
15460		return -EINVAL;
15461	}
15462
15463	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
15464	    BPF_SIZE(insn->code) == BPF_DW ||
15465	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
15466		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
15467		return -EINVAL;
15468	}
15469
15470	/* check whether implicit source operand (register R6) is readable */
15471	err = check_reg_arg(env, ctx_reg, SRC_OP);
15472	if (err)
15473		return err;
15474
15475	/* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
15476	 * gen_ld_abs() may terminate the program at runtime, leading to
	 * a reference leak.
15478	 */
15479	err = check_reference_leak(env, false);
15480	if (err) {
15481		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
15482		return err;
15483	}
15484
15485	if (env->cur_state->active_lock.ptr) {
15486		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
15487		return -EINVAL;
15488	}
15489
15490	if (env->cur_state->active_rcu_lock) {
15491		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
15492		return -EINVAL;
15493	}
15494
15495	if (env->cur_state->active_preempt_lock) {
15496		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
15497		return -EINVAL;
15498	}
15499
15500	if (regs[ctx_reg].type != PTR_TO_CTX) {
15501		verbose(env,
15502			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
15503		return -EINVAL;
15504	}
15505
15506	if (mode == BPF_IND) {
15507		/* check explicit source operand */
15508		err = check_reg_arg(env, insn->src_reg, SRC_OP);
15509		if (err)
15510			return err;
15511	}
15512
15513	err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
15514	if (err < 0)
15515		return err;
15516
15517	/* reset caller saved regs to unreadable */
15518	for (i = 0; i < CALLER_SAVED_REGS; i++) {
15519		mark_reg_not_init(env, regs, caller_saved[i]);
15520		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
15521	}
15522
15523	/* mark destination R0 register as readable, since it contains
15524	 * the value fetched from the packet.
15525	 * Already marked as written above.
15526	 */
15527	mark_reg_unknown(env, regs, BPF_REG_0);
	/* ld_abs loads up to 32 bits of skb data. */
15529	regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
15530	return 0;
15531}
15532
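/* Check the value being returned at program (or callback) exit: it must be
 * a scalar within the range allowed for the program/attach type, e.g.
 * [0, 1] by default for cgroup-style programs and exactly 0 for async
 * callbacks such as timer callbacks.
 */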
15533static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
15534{
15535	const char *exit_ctx = "At program exit";
15536	struct tnum enforce_attach_type_range = tnum_unknown;
15537	const struct bpf_prog *prog = env->prog;
15538	struct bpf_reg_state *reg;
15539	struct bpf_retval_range range = retval_range(0, 1);
15540	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
15541	int err;
15542	struct bpf_func_state *frame = env->cur_state->frame[0];
15543	const bool is_subprog = frame->subprogno;
15544
15545	/* LSM and struct_ops func-ptr's return type could be "void" */
15546	if (!is_subprog || frame->in_exception_callback_fn) {
15547		switch (prog_type) {
15548		case BPF_PROG_TYPE_LSM:
15549			if (prog->expected_attach_type == BPF_LSM_CGROUP)
15550				/* See below, can be 0 or 0-1 depending on hook. */
15551				break;
15552			fallthrough;
15553		case BPF_PROG_TYPE_STRUCT_OPS:
15554			if (!prog->aux->attach_func_proto->type)
15555				return 0;
15556			break;
15557		default:
15558			break;
15559		}
15560	}
15561
	/* The eBPF calling convention is such that R0 is used
	 * to return the value from the eBPF program.
	 * Make sure that it's readable at the time
	 * of bpf_exit, which means that the program wrote
	 * something into it earlier
15567	 */
15568	err = check_reg_arg(env, regno, SRC_OP);
15569	if (err)
15570		return err;
15571
15572	if (is_pointer_value(env, regno)) {
15573		verbose(env, "R%d leaks addr as return value\n", regno);
15574		return -EACCES;
15575	}
15576
15577	reg = cur_regs(env) + regno;
15578
15579	if (frame->in_async_callback_fn) {
15580		/* enforce return zero from async callbacks like timer */
15581		exit_ctx = "At async callback return";
15582		range = retval_range(0, 0);
15583		goto enforce_retval;
15584	}
15585
15586	if (is_subprog && !frame->in_exception_callback_fn) {
15587		if (reg->type != SCALAR_VALUE) {
15588			verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
15589				regno, reg_type_str(env, reg->type));
15590			return -EINVAL;
15591		}
15592		return 0;
15593	}
15594
15595	switch (prog_type) {
15596	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
15597		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
15598		    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
15599		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
15600		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
15601		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
15602		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
15603		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
15604		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
15605		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
15606			range = retval_range(1, 1);
15607		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
15608		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
15609			range = retval_range(0, 3);
15610		break;
15611	case BPF_PROG_TYPE_CGROUP_SKB:
15612		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
15613			range = retval_range(0, 3);
15614			enforce_attach_type_range = tnum_range(2, 3);
15615		}
15616		break;
15617	case BPF_PROG_TYPE_CGROUP_SOCK:
15618	case BPF_PROG_TYPE_SOCK_OPS:
15619	case BPF_PROG_TYPE_CGROUP_DEVICE:
15620	case BPF_PROG_TYPE_CGROUP_SYSCTL:
15621	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
15622		break;
15623	case BPF_PROG_TYPE_RAW_TRACEPOINT:
15624		if (!env->prog->aux->attach_btf_id)
15625			return 0;
15626		range = retval_range(0, 0);
15627		break;
15628	case BPF_PROG_TYPE_TRACING:
15629		switch (env->prog->expected_attach_type) {
15630		case BPF_TRACE_FENTRY:
15631		case BPF_TRACE_FEXIT:
15632			range = retval_range(0, 0);
15633			break;
15634		case BPF_TRACE_RAW_TP:
15635		case BPF_MODIFY_RETURN:
15636			return 0;
15637		case BPF_TRACE_ITER:
15638			break;
15639		default:
15640			return -ENOTSUPP;
15641		}
15642		break;
15643	case BPF_PROG_TYPE_SK_LOOKUP:
15644		range = retval_range(SK_DROP, SK_PASS);
15645		break;
15646
15647	case BPF_PROG_TYPE_LSM:
15648		if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
15649			/* Regular BPF_PROG_TYPE_LSM programs can return
15650			 * any value.
15651			 */
15652			return 0;
15653		}
15654		if (!env->prog->aux->attach_func_proto->type) {
15655			/* Make sure programs that attach to void
15656			 * hooks don't try to modify return value.
15657			 */
15658			range = retval_range(1, 1);
15659		}
15660		break;
15661
15662	case BPF_PROG_TYPE_NETFILTER:
15663		range = retval_range(NF_DROP, NF_ACCEPT);
15664		break;
15665	case BPF_PROG_TYPE_EXT:
15666		/* freplace program can return anything as its return value
15667		 * depends on the to-be-replaced kernel func or bpf program.
15668		 */
15669	default:
15670		return 0;
15671	}
15672
15673enforce_retval:
15674	if (reg->type != SCALAR_VALUE) {
15675		verbose(env, "%s the register R%d is not a known value (%s)\n",
15676			exit_ctx, regno, reg_type_str(env, reg->type));
15677		return -EINVAL;
15678	}
15679
15680	err = mark_chain_precision(env, regno);
15681	if (err)
15682		return err;
15683
15684	if (!retval_range_within(range, reg)) {
15685		verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
15686		if (!is_subprog &&
15687		    prog->expected_attach_type == BPF_LSM_CGROUP &&
15688		    prog_type == BPF_PROG_TYPE_LSM &&
15689		    !prog->aux->attach_func_proto->type)
15690			verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
15691		return -EINVAL;
15692	}
15693
15694	if (!tnum_is_unknown(enforce_attach_type_range) &&
15695	    tnum_in(enforce_attach_type_range, reg->var_off))
15696		env->prog->enforce_expected_attach_type = 1;
15697	return 0;
15698}
15699
15700/* non-recursive DFS pseudo code
15701 * 1  procedure DFS-iterative(G,v):
15702 * 2      label v as discovered
15703 * 3      let S be a stack
15704 * 4      S.push(v)
15705 * 5      while S is not empty
15706 * 6            t <- S.peek()
15707 * 7            if t is what we're looking for:
15708 * 8                return t
15709 * 9            for all edges e in G.adjacentEdges(t) do
15710 * 10               if edge e is already labelled
15711 * 11                   continue with the next edge
15712 * 12               w <- G.adjacentVertex(t,e)
15713 * 13               if vertex w is not discovered and not explored
15714 * 14                   label e as tree-edge
15715 * 15                   label w as discovered
15716 * 16                   S.push(w)
15717 * 17                   continue at 5
15718 * 18               else if vertex w is discovered
15719 * 19                   label e as back-edge
15720 * 20               else
15721 * 21                   // vertex w is explored
15722 * 22                   label e as forward- or cross-edge
15723 * 23           label t as explored
15724 * 24           S.pop()
15725 *
15726 * convention:
15727 * 0x10 - discovered
15728 * 0x11 - discovered and fall-through edge labelled
15729 * 0x12 - discovered and fall-through and branch edges labelled
15730 * 0x20 - explored
15731 */
15732
15733enum {
15734	DISCOVERED = 0x10,
15735	EXPLORED = 0x20,
15736	FALLTHROUGH = 1,
15737	BRANCH = 2,
15738};
15739
15740static void mark_prune_point(struct bpf_verifier_env *env, int idx)
15741{
15742	env->insn_aux_data[idx].prune_point = true;
15743}
15744
15745static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
15746{
15747	return env->insn_aux_data[insn_idx].prune_point;
15748}
15749
15750static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
15751{
15752	env->insn_aux_data[idx].force_checkpoint = true;
15753}
15754
15755static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
15756{
15757	return env->insn_aux_data[insn_idx].force_checkpoint;
15758}
15759
15760static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
15761{
15762	env->insn_aux_data[idx].calls_callback = true;
15763}
15764
15765static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
15766{
15767	return env->insn_aux_data[insn_idx].calls_callback;
15768}
15769
15770enum {
15771	DONE_EXPLORING = 0,
15772	KEEP_EXPLORING = 1,
15773};
15774
15775/* t, w, e - match pseudo-code above:
15776 * t - index of current instruction
15777 * w - next instruction
15778 * e - edge
15779 */
15780static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
15781{
15782	int *insn_stack = env->cfg.insn_stack;
15783	int *insn_state = env->cfg.insn_state;
15784
15785	if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
15786		return DONE_EXPLORING;
15787
15788	if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
15789		return DONE_EXPLORING;
15790
15791	if (w < 0 || w >= env->prog->len) {
15792		verbose_linfo(env, t, "%d: ", t);
15793		verbose(env, "jump out of range from insn %d to %d\n", t, w);
15794		return -EINVAL;
15795	}
15796
15797	if (e == BRANCH) {
15798		/* mark branch target for state pruning */
15799		mark_prune_point(env, w);
15800		mark_jmp_point(env, w);
15801	}
15802
15803	if (insn_state[w] == 0) {
15804		/* tree-edge */
15805		insn_state[t] = DISCOVERED | e;
15806		insn_state[w] = DISCOVERED;
15807		if (env->cfg.cur_stack >= env->prog->len)
15808			return -E2BIG;
15809		insn_stack[env->cfg.cur_stack++] = w;
15810		return KEEP_EXPLORING;
15811	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
15812		if (env->bpf_capable)
15813			return DONE_EXPLORING;
15814		verbose_linfo(env, t, "%d: ", t);
15815		verbose_linfo(env, w, "%d: ", w);
15816		verbose(env, "back-edge from insn %d to %d\n", t, w);
15817		return -EINVAL;
15818	} else if (insn_state[w] == EXPLORED) {
15819		/* forward- or cross-edge */
15820		insn_state[t] = DISCOVERED | e;
15821	} else {
15822		verbose(env, "insn state internal bug\n");
15823		return -EFAULT;
15824	}
15825	return DONE_EXPLORING;
15826}
15827
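/* Explore both edges of a call instruction during check_cfg(): the
 * fall-through edge to the instruction after the call (two slots for
 * ldimm64-based pseudo funcs) and, when visit_callee is set, the branch
 * edge into the callee's first instruction.
 */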
15828static int visit_func_call_insn(int t, struct bpf_insn *insns,
15829				struct bpf_verifier_env *env,
15830				bool visit_callee)
15831{
15832	int ret, insn_sz;
15833
15834	insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
15835	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
15836	if (ret)
15837		return ret;
15838
15839	mark_prune_point(env, t + insn_sz);
15840	/* when we exit from subprog, we need to record non-linear history */
15841	mark_jmp_point(env, t + insn_sz);
15842
15843	if (visit_callee) {
15844		mark_prune_point(env, t);
15845		ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
15846	}
15847	return ret;
15848}
15849
15850/* Visits the instruction at index t and returns one of the following:
15851 *  < 0 - an error occurred
15852 *  DONE_EXPLORING - the instruction was fully explored
15853 *  KEEP_EXPLORING - there is still work to be done before it is fully explored
15854 */
15855static int visit_insn(int t, struct bpf_verifier_env *env)
15856{
15857	struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
15858	int ret, off, insn_sz;
15859
15860	if (bpf_pseudo_func(insn))
15861		return visit_func_call_insn(t, insns, env, true);
15862
15863	/* All non-branch instructions have a single fall-through edge. */
15864	if (BPF_CLASS(insn->code) != BPF_JMP &&
15865	    BPF_CLASS(insn->code) != BPF_JMP32) {
15866		insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
15867		return push_insn(t, t + insn_sz, FALLTHROUGH, env);
15868	}
15869
15870	switch (BPF_OP(insn->code)) {
15871	case BPF_EXIT:
15872		return DONE_EXPLORING;
15873
15874	case BPF_CALL:
15875		if (is_async_callback_calling_insn(insn))
			/* Mark this call insn as a prune point to trigger
			 * the is_state_visited() check before the call itself
			 * is processed by __check_func_call(). Otherwise a new
			 * async state will be pushed for further exploration.
15880			 */
15881			mark_prune_point(env, t);
		/* For functions that invoke callbacks it is not known how many
		 * times the callback would be called. The verifier models such
		 * callback-calling functions by repeatedly visiting the callback
		 * body and returning to the original call instruction.
		 * In order to stop such iteration the verifier needs to identify
		 * when a state identical to some state from a previous iteration
		 * is reached. The check below forces creation of a checkpoint
		 * before the callback-calling instruction to allow searching for
		 * such identical states.
		 */
15891		if (is_sync_callback_calling_insn(insn)) {
15892			mark_calls_callback(env, t);
15893			mark_force_checkpoint(env, t);
15894			mark_prune_point(env, t);
15895			mark_jmp_point(env, t);
15896		}
15897		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
15898			struct bpf_kfunc_call_arg_meta meta;
15899
15900			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
15901			if (ret == 0 && is_iter_next_kfunc(&meta)) {
15902				mark_prune_point(env, t);
15903				/* Checking and saving state checkpoints at iter_next() call
15904				 * is crucial for fast convergence of open-coded iterator loop
15905				 * logic, so we need to force it. If we don't do that,
15906				 * is_state_visited() might skip saving a checkpoint, causing
15907				 * unnecessarily long sequence of not checkpointed
15908				 * instructions and jumps, leading to exhaustion of jump
15909				 * history buffer, and potentially other undesired outcomes.
15910				 * It is expected that with correct open-coded iterators
15911				 * convergence will happen quickly, so we don't run a risk of
15912				 * exhausting memory.
15913				 */
15914				mark_force_checkpoint(env, t);
15915			}
15916		}
15917		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
15918
15919	case BPF_JA:
15920		if (BPF_SRC(insn->code) != BPF_K)
15921			return -EINVAL;
15922
15923		if (BPF_CLASS(insn->code) == BPF_JMP)
15924			off = insn->off;
15925		else
15926			off = insn->imm;
15927
15928		/* unconditional jump with single edge */
15929		ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
15930		if (ret)
15931			return ret;
15932
15933		mark_prune_point(env, t + off + 1);
15934		mark_jmp_point(env, t + off + 1);
15935
15936		return ret;
15937
15938	default:
15939		/* conditional jump with two edges */
15940		mark_prune_point(env, t);
15941		if (is_may_goto_insn(insn))
15942			mark_force_checkpoint(env, t);
15943
15944		ret = push_insn(t, t + 1, FALLTHROUGH, env);
15945		if (ret)
15946			return ret;
15947
15948		return push_insn(t, t + insn->off + 1, BRANCH, env);
15949	}
15950}
15951
15952/* non-recursive depth-first-search to detect loops in BPF program
15953 * loop == back-edge in directed graph
15954 */
15955static int check_cfg(struct bpf_verifier_env *env)
15956{
15957	int insn_cnt = env->prog->len;
15958	int *insn_stack, *insn_state;
15959	int ex_insn_beg, i, ret = 0;
15960	bool ex_done = false;
15961
15962	insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
15963	if (!insn_state)
15964		return -ENOMEM;
15965
15966	insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
15967	if (!insn_stack) {
15968		kvfree(insn_state);
15969		return -ENOMEM;
15970	}
15971
15972	insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
15973	insn_stack[0] = 0; /* 0 is the first instruction */
15974	env->cfg.cur_stack = 1;
15975
15976walk_cfg:
15977	while (env->cfg.cur_stack > 0) {
15978		int t = insn_stack[env->cfg.cur_stack - 1];
15979
15980		ret = visit_insn(t, env);
15981		switch (ret) {
15982		case DONE_EXPLORING:
15983			insn_state[t] = EXPLORED;
15984			env->cfg.cur_stack--;
15985			break;
15986		case KEEP_EXPLORING:
15987			break;
15988		default:
15989			if (ret > 0) {
15990				verbose(env, "visit_insn internal bug\n");
15991				ret = -EFAULT;
15992			}
15993			goto err_free;
15994		}
15995	}
15996
15997	if (env->cfg.cur_stack < 0) {
15998		verbose(env, "pop stack internal bug\n");
15999		ret = -EFAULT;
16000		goto err_free;
16001	}
16002
16003	if (env->exception_callback_subprog && !ex_done) {
16004		ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
16005
16006		insn_state[ex_insn_beg] = DISCOVERED;
16007		insn_stack[0] = ex_insn_beg;
16008		env->cfg.cur_stack = 1;
16009		ex_done = true;
16010		goto walk_cfg;
16011	}
16012
16013	for (i = 0; i < insn_cnt; i++) {
16014		struct bpf_insn *insn = &env->prog->insnsi[i];
16015
16016		if (insn_state[i] != EXPLORED) {
16017			verbose(env, "unreachable insn %d\n", i);
16018			ret = -EINVAL;
16019			goto err_free;
16020		}
16021		if (bpf_is_ldimm64(insn)) {
16022			if (insn_state[i + 1] != 0) {
16023				verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
16024				ret = -EINVAL;
16025				goto err_free;
16026			}
16027			i++; /* skip second half of ldimm64 */
16028		}
16029	}
16030	ret = 0; /* cfg looks good */
16031
16032err_free:
16033	kvfree(insn_state);
16034	kvfree(insn_stack);
16035	env->cfg.insn_state = env->cfg.insn_stack = NULL;
16036	return ret;
16037}
16038
16039static int check_abnormal_return(struct bpf_verifier_env *env)
16040{
16041	int i;
16042
16043	for (i = 1; i < env->subprog_cnt; i++) {
16044		if (env->subprog_info[i].has_ld_abs) {
16045			verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
16046			return -EINVAL;
16047		}
16048		if (env->subprog_info[i].has_tail_call) {
16049			verbose(env, "tail_call is not allowed in subprogs without BTF\n");
16050			return -EINVAL;
16051		}
16052	}
16053	return 0;
16054}
16055
16056/* The minimum supported BTF func info size */
16057#define MIN_BPF_FUNCINFO_SIZE	8
16058#define MAX_FUNCINFO_REC_SIZE	252
16059
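/* First pass over the user-supplied func_info records: validate the record
 * size, copy the records into the kernel, and check that insn_off values
 * are strictly increasing and that every type_id refers to a BTF FUNC type.
 */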
16060static int check_btf_func_early(struct bpf_verifier_env *env,
16061				const union bpf_attr *attr,
16062				bpfptr_t uattr)
16063{
16064	u32 krec_size = sizeof(struct bpf_func_info);
16065	const struct btf_type *type, *func_proto;
16066	u32 i, nfuncs, urec_size, min_size;
16067	struct bpf_func_info *krecord;
16068	struct bpf_prog *prog;
16069	const struct btf *btf;
16070	u32 prev_offset = 0;
16071	bpfptr_t urecord;
16072	int ret = -ENOMEM;
16073
16074	nfuncs = attr->func_info_cnt;
16075	if (!nfuncs) {
16076		if (check_abnormal_return(env))
16077			return -EINVAL;
16078		return 0;
16079	}
16080
16081	urec_size = attr->func_info_rec_size;
16082	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
16083	    urec_size > MAX_FUNCINFO_REC_SIZE ||
16084	    urec_size % sizeof(u32)) {
16085		verbose(env, "invalid func info rec size %u\n", urec_size);
16086		return -EINVAL;
16087	}
16088
16089	prog = env->prog;
16090	btf = prog->aux->btf;
16091
16092	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
16093	min_size = min_t(u32, krec_size, urec_size);
16094
16095	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
16096	if (!krecord)
16097		return -ENOMEM;
16098
16099	for (i = 0; i < nfuncs; i++) {
16100		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
16101		if (ret) {
16102			if (ret == -E2BIG) {
16103				verbose(env, "nonzero tailing record in func info");
16104				/* set the size kernel expects so loader can zero
16105				 * out the rest of the record.
16106				 */
16107				if (copy_to_bpfptr_offset(uattr,
16108							  offsetof(union bpf_attr, func_info_rec_size),
16109							  &min_size, sizeof(min_size)))
16110					ret = -EFAULT;
16111			}
16112			goto err_free;
16113		}
16114
16115		if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
16116			ret = -EFAULT;
16117			goto err_free;
16118		}
16119
16120		/* check insn_off */
16121		ret = -EINVAL;
16122		if (i == 0) {
16123			if (krecord[i].insn_off) {
16124				verbose(env,
16125					"nonzero insn_off %u for the first func info record",
16126					krecord[i].insn_off);
16127				goto err_free;
16128			}
16129		} else if (krecord[i].insn_off <= prev_offset) {
16130			verbose(env,
16131				"same or smaller insn offset (%u) than previous func info record (%u)",
16132				krecord[i].insn_off, prev_offset);
16133			goto err_free;
16134		}
16135
16136		/* check type_id */
16137		type = btf_type_by_id(btf, krecord[i].type_id);
16138		if (!type || !btf_type_is_func(type)) {
16139			verbose(env, "invalid type id %d in func info",
16140				krecord[i].type_id);
16141			goto err_free;
16142		}
16143
16144		func_proto = btf_type_by_id(btf, type->type);
16145		if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
16146			/* btf_func_check() already verified it during BTF load */
16147			goto err_free;
16148
16149		prev_offset = krecord[i].insn_off;
16150		bpfptr_add(&urecord, urec_size);
16151	}
16152
16153	prog->aux->func_info = krecord;
16154	prog->aux->func_info_cnt = nfuncs;
16155	return 0;
16156
16157err_free:
16158	kvfree(krecord);
16159	return ret;
16160}
16161
16162static int check_btf_func(struct bpf_verifier_env *env,
16163			  const union bpf_attr *attr,
16164			  bpfptr_t uattr)
16165{
16166	const struct btf_type *type, *func_proto, *ret_type;
16167	u32 i, nfuncs, urec_size;
16168	struct bpf_func_info *krecord;
16169	struct bpf_func_info_aux *info_aux = NULL;
16170	struct bpf_prog *prog;
16171	const struct btf *btf;
16172	bpfptr_t urecord;
16173	bool scalar_return;
16174	int ret = -ENOMEM;
16175
16176	nfuncs = attr->func_info_cnt;
16177	if (!nfuncs) {
16178		if (check_abnormal_return(env))
16179			return -EINVAL;
16180		return 0;
16181	}
16182	if (nfuncs != env->subprog_cnt) {
16183		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
16184		return -EINVAL;
16185	}
16186
16187	urec_size = attr->func_info_rec_size;
16188
16189	prog = env->prog;
16190	btf = prog->aux->btf;
16191
16192	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
16193
16194	krecord = prog->aux->func_info;
16195	info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
16196	if (!info_aux)
16197		return -ENOMEM;
16198
16199	for (i = 0; i < nfuncs; i++) {
16200		/* check insn_off */
16201		ret = -EINVAL;
16202
16203		if (env->subprog_info[i].start != krecord[i].insn_off) {
16204			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
16205			goto err_free;
16206		}
16207
16208		/* Already checked type_id */
16209		type = btf_type_by_id(btf, krecord[i].type_id);
16210		info_aux[i].linkage = BTF_INFO_VLEN(type->info);
16211		/* Already checked func_proto */
16212		func_proto = btf_type_by_id(btf, type->type);
16213
16214		ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
16215		scalar_return =
16216			btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
16217		if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
16218			verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
16219			goto err_free;
16220		}
16221		if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
16222			verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
16223			goto err_free;
16224		}
16225
16226		bpfptr_add(&urecord, urec_size);
16227	}
16228
16229	prog->aux->func_info_aux = info_aux;
16230	return 0;
16231
16232err_free:
16233	kfree(info_aux);
16234	return ret;
16235}
16236
16237static void adjust_btf_func(struct bpf_verifier_env *env)
16238{
16239	struct bpf_prog_aux *aux = env->prog->aux;
16240	int i;
16241
16242	if (!aux->func_info)
16243		return;
16244
16245	/* func_info is not available for hidden subprogs */
16246	for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
16247		aux->func_info[i].insn_off = env->subprog_info[i].start;
16248}
16249
16250#define MIN_BPF_LINEINFO_SIZE	offsetofend(struct bpf_line_info, line_col)
16251#define MAX_LINEINFO_REC_SIZE	MAX_FUNCINFO_REC_SIZE
16252
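/* Validate and copy the user-supplied line_info records: each insn_off must
 * be strictly increasing and point at a real instruction, the string offsets
 * must be valid BTF strings, and every subprog start must have a matching
 * line_info record.
 */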
16253static int check_btf_line(struct bpf_verifier_env *env,
16254			  const union bpf_attr *attr,
16255			  bpfptr_t uattr)
16256{
16257	u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
16258	struct bpf_subprog_info *sub;
16259	struct bpf_line_info *linfo;
16260	struct bpf_prog *prog;
16261	const struct btf *btf;
16262	bpfptr_t ulinfo;
16263	int err;
16264
16265	nr_linfo = attr->line_info_cnt;
16266	if (!nr_linfo)
16267		return 0;
16268	if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
16269		return -EINVAL;
16270
16271	rec_size = attr->line_info_rec_size;
16272	if (rec_size < MIN_BPF_LINEINFO_SIZE ||
16273	    rec_size > MAX_LINEINFO_REC_SIZE ||
16274	    rec_size & (sizeof(u32) - 1))
16275		return -EINVAL;
16276
	/* Need to zero it in case userspace passes in a smaller
	 * bpf_line_info object.
16279	 */
16280	linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
16281			 GFP_KERNEL | __GFP_NOWARN);
16282	if (!linfo)
16283		return -ENOMEM;
16284
16285	prog = env->prog;
16286	btf = prog->aux->btf;
16287
16288	s = 0;
16289	sub = env->subprog_info;
16290	ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
16291	expected_size = sizeof(struct bpf_line_info);
16292	ncopy = min_t(u32, expected_size, rec_size);
16293	for (i = 0; i < nr_linfo; i++) {
16294		err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
16295		if (err) {
16296			if (err == -E2BIG) {
				verbose(env, "nonzero trailing record in line_info");
16298				if (copy_to_bpfptr_offset(uattr,
16299							  offsetof(union bpf_attr, line_info_rec_size),
16300							  &expected_size, sizeof(expected_size)))
16301					err = -EFAULT;
16302			}
16303			goto err_free;
16304		}
16305
16306		if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
16307			err = -EFAULT;
16308			goto err_free;
16309		}
16310
16311		/*
16312		 * Check insn_off to ensure
16313		 * 1) strictly increasing AND
16314		 * 2) bounded by prog->len
16315		 *
16316		 * The linfo[0].insn_off == 0 check logically falls into
16317		 * the later "missing bpf_line_info for func..." case
		 * because linfo[0].insn_off must also be the start of the
		 * first subprog, and the first subprog must have
		 * subprog_info[0].start == 0.
16321		 */
16322		if ((i && linfo[i].insn_off <= prev_offset) ||
16323		    linfo[i].insn_off >= prog->len) {
16324			verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
16325				i, linfo[i].insn_off, prev_offset,
16326				prog->len);
16327			err = -EINVAL;
16328			goto err_free;
16329		}
16330
16331		if (!prog->insnsi[linfo[i].insn_off].code) {
16332			verbose(env,
16333				"Invalid insn code at line_info[%u].insn_off\n",
16334				i);
16335			err = -EINVAL;
16336			goto err_free;
16337		}
16338
16339		if (!btf_name_by_offset(btf, linfo[i].line_off) ||
16340		    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
16341			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
16342			err = -EINVAL;
16343			goto err_free;
16344		}
16345
16346		if (s != env->subprog_cnt) {
16347			if (linfo[i].insn_off == sub[s].start) {
16348				sub[s].linfo_idx = i;
16349				s++;
16350			} else if (sub[s].start < linfo[i].insn_off) {
16351				verbose(env, "missing bpf_line_info for func#%u\n", s);
16352				err = -EINVAL;
16353				goto err_free;
16354			}
16355		}
16356
16357		prev_offset = linfo[i].insn_off;
16358		bpfptr_add(&ulinfo, rec_size);
16359	}
16360
16361	if (s != env->subprog_cnt) {
16362		verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
16363			env->subprog_cnt - s, s);
16364		err = -EINVAL;
16365		goto err_free;
16366	}
16367
16368	prog->aux->linfo = linfo;
16369	prog->aux->nr_linfo = nr_linfo;
16370
16371	return 0;
16372
16373err_free:
16374	kvfree(linfo);
16375	return err;
16376}
16377
16378#define MIN_CORE_RELO_SIZE	sizeof(struct bpf_core_relo)
16379#define MAX_CORE_RELO_SIZE	MAX_FUNCINFO_REC_SIZE
16380
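/* Each CO-RE relocation record targets a single instruction: insn_off is a
 * byte offset that must land on an instruction boundary inside the program,
 * i.e. a multiple of 8 smaller than prog->len * 8. For example, a record
 * patching the third instruction would carry insn_off == 16.
 */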
16381static int check_core_relo(struct bpf_verifier_env *env,
16382			   const union bpf_attr *attr,
16383			   bpfptr_t uattr)
16384{
16385	u32 i, nr_core_relo, ncopy, expected_size, rec_size;
16386	struct bpf_core_relo core_relo = {};
16387	struct bpf_prog *prog = env->prog;
16388	const struct btf *btf = prog->aux->btf;
16389	struct bpf_core_ctx ctx = {
16390		.log = &env->log,
16391		.btf = btf,
16392	};
16393	bpfptr_t u_core_relo;
16394	int err;
16395
16396	nr_core_relo = attr->core_relo_cnt;
16397	if (!nr_core_relo)
16398		return 0;
16399	if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
16400		return -EINVAL;
16401
16402	rec_size = attr->core_relo_rec_size;
16403	if (rec_size < MIN_CORE_RELO_SIZE ||
16404	    rec_size > MAX_CORE_RELO_SIZE ||
16405	    rec_size % sizeof(u32))
16406		return -EINVAL;
16407
16408	u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
16409	expected_size = sizeof(struct bpf_core_relo);
16410	ncopy = min_t(u32, expected_size, rec_size);
16411
16412	/* Unlike func_info and line_info, copy and apply each CO-RE
16413	 * relocation record one at a time.
16414	 */
16415	for (i = 0; i < nr_core_relo; i++) {
16416		/* future proofing when sizeof(bpf_core_relo) changes */
16417		err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
16418		if (err) {
16419			if (err == -E2BIG) {
				verbose(env, "nonzero trailing record in core_relo");
16421				if (copy_to_bpfptr_offset(uattr,
16422							  offsetof(union bpf_attr, core_relo_rec_size),
16423							  &expected_size, sizeof(expected_size)))
16424					err = -EFAULT;
16425			}
16426			break;
16427		}
16428
16429		if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
16430			err = -EFAULT;
16431			break;
16432		}
16433
16434		if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
16435			verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
16436				i, core_relo.insn_off, prog->len);
16437			err = -EINVAL;
16438			break;
16439		}
16440
16441		err = bpf_core_apply(&ctx, &core_relo, i,
16442				     &prog->insnsi[core_relo.insn_off / 8]);
16443		if (err)
16444			break;
16445		bpfptr_add(&u_core_relo, rec_size);
16446	}
16447	return err;
16448}
16449
16450static int check_btf_info_early(struct bpf_verifier_env *env,
16451				const union bpf_attr *attr,
16452				bpfptr_t uattr)
16453{
16454	struct btf *btf;
16455	int err;
16456
16457	if (!attr->func_info_cnt && !attr->line_info_cnt) {
16458		if (check_abnormal_return(env))
16459			return -EINVAL;
16460		return 0;
16461	}
16462
16463	btf = btf_get_by_fd(attr->prog_btf_fd);
16464	if (IS_ERR(btf))
16465		return PTR_ERR(btf);
16466	if (btf_is_kernel(btf)) {
16467		btf_put(btf);
16468		return -EACCES;
16469	}
16470	env->prog->aux->btf = btf;
16471
16472	err = check_btf_func_early(env, attr, uattr);
16473	if (err)
16474		return err;
16475	return 0;
16476}
16477
16478static int check_btf_info(struct bpf_verifier_env *env,
16479			  const union bpf_attr *attr,
16480			  bpfptr_t uattr)
16481{
16482	int err;
16483
16484	if (!attr->func_info_cnt && !attr->line_info_cnt) {
16485		if (check_abnormal_return(env))
16486			return -EINVAL;
16487		return 0;
16488	}
16489
16490	err = check_btf_func(env, attr, uattr);
16491	if (err)
16492		return err;
16493
16494	err = check_btf_line(env, attr, uattr);
16495	if (err)
16496		return err;
16497
16498	err = check_core_relo(env, attr, uattr);
16499	if (err)
16500		return err;
16501
16502	return 0;
16503}
16504
16505/* check %cur's range satisfies %old's */
16506static bool range_within(const struct bpf_reg_state *old,
16507			 const struct bpf_reg_state *cur)
16508{
16509	return old->umin_value <= cur->umin_value &&
16510	       old->umax_value >= cur->umax_value &&
16511	       old->smin_value <= cur->smin_value &&
16512	       old->smax_value >= cur->smax_value &&
16513	       old->u32_min_value <= cur->u32_min_value &&
16514	       old->u32_max_value >= cur->u32_max_value &&
16515	       old->s32_min_value <= cur->s32_min_value &&
16516	       old->s32_max_value >= cur->s32_max_value;
16517}
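
/* For example, if the old (already verified) register was known to be in
 * [2, 8] for each of the tracked u64/s64/u32/s32 ranges, then a current
 * register known to be in [3, 5] is within that range, while one in [0, 5]
 * is not, since it admits values the old state was never verified against.
 */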
16518
16519/* If in the old state two registers had the same id, then they need to have
16520 * the same id in the new state as well.  But that id could be different from
16521 * the old state, so we need to track the mapping from old to new ids.
16522 * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
16523 * regs with old id 5 must also have new id 9 for the new state to be safe.  But
16524 * regs with a different old id could still have new id 9, we don't care about
16525 * that.
16526 * So we look through our idmap to see if this old id has been seen before.  If
16527 * so, we require the new id to match; otherwise, we add the id pair to the map.
16528 */
16529static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
16530{
16531	struct bpf_id_pair *map = idmap->map;
16532	unsigned int i;
16533
16534	/* either both IDs should be set or both should be zero */
16535	if (!!old_id != !!cur_id)
16536		return false;
16537
16538	if (old_id == 0) /* cur_id == 0 as well */
16539		return true;
16540
16541	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
16542		if (!map[i].old) {
16543			/* Reached an empty slot; haven't seen this id before */
16544			map[i].old = old_id;
16545			map[i].cur = cur_id;
16546			return true;
16547		}
16548		if (map[i].old == old_id)
16549			return map[i].cur == cur_id;
16550		if (map[i].cur == cur_id)
16551			return false;
16552	}
16553	/* We ran out of idmap slots, which should be impossible */
16554	WARN_ON_ONCE(1);
16555	return false;
16556}
16557
16558/* Similar to check_ids(), but allocate a unique temporary ID
16559 * for 'old_id' or 'cur_id' of zero.
16560 * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
16561 */
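/* E.g. comparing an old register with id == 0 against a current register with
 * id == 7: the old id is replaced by a fresh temporary one, so the pair is
 * accepted as long as 7 has not already been paired with a different old id.
 */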
16562static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
16563{
16564	old_id = old_id ? old_id : ++idmap->tmp_id_gen;
16565	cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
16566
16567	return check_ids(old_id, cur_id, idmap);
16568}
16569
16570static void clean_func_state(struct bpf_verifier_env *env,
16571			     struct bpf_func_state *st)
16572{
16573	enum bpf_reg_liveness live;
16574	int i, j;
16575
16576	for (i = 0; i < BPF_REG_FP; i++) {
16577		live = st->regs[i].live;
16578		/* liveness must not touch this register anymore */
16579		st->regs[i].live |= REG_LIVE_DONE;
16580		if (!(live & REG_LIVE_READ))
16581			/* since the register is unused, clear its state
16582			 * to make further comparison simpler
16583			 */
16584			__mark_reg_not_init(env, &st->regs[i]);
16585	}
16586
16587	for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
16588		live = st->stack[i].spilled_ptr.live;
16589		/* liveness must not touch this stack slot anymore */
16590		st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
16591		if (!(live & REG_LIVE_READ)) {
16592			__mark_reg_not_init(env, &st->stack[i].spilled_ptr);
16593			for (j = 0; j < BPF_REG_SIZE; j++)
16594				st->stack[i].slot_type[j] = STACK_INVALID;
16595		}
16596	}
16597}
16598
16599static void clean_verifier_state(struct bpf_verifier_env *env,
16600				 struct bpf_verifier_state *st)
16601{
16602	int i;
16603
16604	if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
16605		/* all regs in this state in all frames were already marked */
16606		return;
16607
16608	for (i = 0; i <= st->curframe; i++)
16609		clean_func_state(env, st->frame[i]);
16610}
16611
16612/* the parentage chains form a tree.
16613 * the verifier states are added to state lists at given insn and
16614 * pushed into state stack for future exploration.
 * when the verifier reaches bpf_exit insn some of the verifier states
16616 * stored in the state lists have their final liveness state already,
16617 * but a lot of states will get revised from liveness point of view when
16618 * the verifier explores other branches.
16619 * Example:
16620 * 1: r0 = 1
16621 * 2: if r1 == 100 goto pc+1
16622 * 3: r0 = 2
16623 * 4: exit
16624 * when the verifier reaches exit insn the register r0 in the state list of
16625 * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
16626 * of insn 2 and goes exploring further. At the insn 4 it will walk the
16627 * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
16628 *
16629 * Since the verifier pushes the branch states as it sees them while exploring
 * the program, walking the branch instruction for the second
16631 * time means that all states below this branch were already explored and
16632 * their final liveness marks are already propagated.
16633 * Hence when the verifier completes the search of state list in is_state_visited()
16634 * we can call this clean_live_states() function to mark all liveness states
16635 * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
16636 * will not be used.
 * This function also clears the registers and stack slots that were not
 * marked REG_LIVE_READ, to simplify state merging.
16639 *
 * An important note here is that walking the same branch instruction in the
 * callee doesn't mean that the states are DONE. The verifier has to compare
 * the callsites as well.
16643 */
16644static void clean_live_states(struct bpf_verifier_env *env, int insn,
16645			      struct bpf_verifier_state *cur)
16646{
16647	struct bpf_verifier_state_list *sl;
16648
16649	sl = *explored_state(env, insn);
16650	while (sl) {
16651		if (sl->state.branches)
16652			goto next;
16653		if (sl->state.insn_idx != insn ||
16654		    !same_callsites(&sl->state, cur))
16655			goto next;
16656		clean_verifier_state(env, &sl->state);
16657next:
16658		sl = sl->next;
16659	}
16660}
16661
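/* Two registers match exactly if everything before the 'id' field (type,
 * offset, the type-specific data, var_off and all min/max bounds) is
 * byte-for-byte identical and their id/ref_obj_id values map consistently
 * between the old and current state via the idmap.
 */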
16662static bool regs_exact(const struct bpf_reg_state *rold,
16663		       const struct bpf_reg_state *rcur,
16664		       struct bpf_idmap *idmap)
16665{
16666	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
16667	       check_ids(rold->id, rcur->id, idmap) &&
16668	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
16669}
16670
16671enum exact_level {
16672	NOT_EXACT,
16673	EXACT,
16674	RANGE_WITHIN
16675};
16676
16677/* Returns true if (rold safe implies rcur safe) */
16678static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
16679		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
16680		    enum exact_level exact)
16681{
16682	if (exact == EXACT)
16683		return regs_exact(rold, rcur, idmap);
16684
16685	if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
16686		/* explored state didn't use this */
16687		return true;
16688	if (rold->type == NOT_INIT) {
16689		if (exact == NOT_EXACT || rcur->type == NOT_INIT)
16690			/* explored state can't have used this */
16691			return true;
16692	}
16693
16694	/* Enforce that register types have to match exactly, including their
16695	 * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
16696	 * rule.
16697	 *
16698	 * One can make a point that using a pointer register as unbounded
16699	 * SCALAR would be technically acceptable, but this could lead to
16700	 * pointer leaks because scalars are allowed to leak while pointers
16701	 * are not. We could make this safe in special cases if root is
16702	 * calling us, but it's probably not worth the hassle.
16703	 *
16704	 * Also, register types that are *not* MAYBE_NULL could technically be
16705	 * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
16706	 * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
16707	 * to the same map).
16708	 * However, if the old MAYBE_NULL register then got NULL checked,
16709	 * doing so could have affected others with the same id, and we can't
16710	 * check for that because we lost the id when we converted to
16711	 * a non-MAYBE_NULL variant.
16712	 * So, as a general rule we don't allow mixing MAYBE_NULL and
16713	 * non-MAYBE_NULL registers as well.
16714	 */
16715	if (rold->type != rcur->type)
16716		return false;
16717
16718	switch (base_type(rold->type)) {
16719	case SCALAR_VALUE:
16720		if (env->explore_alu_limits) {
16721			/* explore_alu_limits disables tnum_in() and range_within()
16722			 * logic and requires everything to be strict
16723			 */
16724			return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
16725			       check_scalar_ids(rold->id, rcur->id, idmap);
16726		}
16727		if (!rold->precise && exact == NOT_EXACT)
16728			return true;
16729		/* Why check_ids() for scalar registers?
16730		 *
16731		 * Consider the following BPF code:
16732		 *   1: r6 = ... unbound scalar, ID=a ...
16733		 *   2: r7 = ... unbound scalar, ID=b ...
16734		 *   3: if (r6 > r7) goto +1
16735		 *   4: r6 = r7
16736		 *   5: if (r6 > X) goto ...
16737		 *   6: ... memory operation using r7 ...
16738		 *
16739		 * First verification path is [1-6]:
16740		 * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
16741		 * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
16742		 *   r7 <= X, because r6 and r7 share same id.
16743		 * Next verification path is [1-4, 6].
16744		 *
16745		 * Instruction (6) would be reached in two states:
16746		 *   I.  r6{.id=b}, r7{.id=b} via path 1-6;
16747		 *   II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
16748		 *
16749		 * Use check_ids() to distinguish these states.
16750		 * ---
16751		 * Also verify that new value satisfies old value range knowledge.
16752		 */
16753		return range_within(rold, rcur) &&
16754		       tnum_in(rold->var_off, rcur->var_off) &&
16755		       check_scalar_ids(rold->id, rcur->id, idmap);
16756	case PTR_TO_MAP_KEY:
16757	case PTR_TO_MAP_VALUE:
16758	case PTR_TO_MEM:
16759	case PTR_TO_BUF:
16760	case PTR_TO_TP_BUFFER:
16761		/* If the new min/max/var_off satisfy the old ones and
16762		 * everything else matches, we are OK.
16763		 */
16764		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
16765		       range_within(rold, rcur) &&
16766		       tnum_in(rold->var_off, rcur->var_off) &&
16767		       check_ids(rold->id, rcur->id, idmap) &&
16768		       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
16769	case PTR_TO_PACKET_META:
16770	case PTR_TO_PACKET:
16771		/* We must have at least as much range as the old ptr
16772		 * did, so that any accesses which were safe before are
16773		 * still safe.  This is true even if old range < old off,
16774		 * since someone could have accessed through (ptr - k), or
16775		 * even done ptr -= k in a register, to get a safe access.
16776		 */
16777		if (rold->range > rcur->range)
16778			return false;
16779		/* If the offsets don't match, we can't trust our alignment;
16780		 * nor can we be sure that we won't fall out of range.
16781		 */
16782		if (rold->off != rcur->off)
16783			return false;
16784		/* id relations must be preserved */
16785		if (!check_ids(rold->id, rcur->id, idmap))
16786			return false;
16787		/* new val must satisfy old val knowledge */
16788		return range_within(rold, rcur) &&
16789		       tnum_in(rold->var_off, rcur->var_off);
16790	case PTR_TO_STACK:
16791		/* two stack pointers are equal only if they're pointing to
16792		 * the same stack frame, since fp-8 in foo != fp-8 in bar
16793		 */
16794		return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
16795	case PTR_TO_ARENA:
16796		return true;
16797	default:
16798		return regs_exact(rold, rcur, idmap);
16799	}
16800}
16801
16802static struct bpf_reg_state unbound_reg;
16803
16804static __init int unbound_reg_init(void)
16805{
16806	__mark_reg_unknown_imprecise(&unbound_reg);
16807	unbound_reg.live |= REG_LIVE_READ;
16808	return 0;
16809}
16810late_initcall(unbound_reg_init);
16811
16812static bool is_stack_all_misc(struct bpf_verifier_env *env,
16813			      struct bpf_stack_state *stack)
16814{
16815	u32 i;
16816
16817	for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
16818		if ((stack->slot_type[i] == STACK_MISC) ||
16819		    (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
16820			continue;
16821		return false;
16822	}
16823
16824	return true;
16825}
16826
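/* Pick the register state to use when comparing a stack slot as a scalar:
 * a 64-bit spilled scalar register is represented by its spilled_ptr state,
 * while a slot consisting only of STACK_MISC bytes (or STACK_INVALID ones,
 * when uninitialized stack reads are allowed), which would read back as an
 * unbound scalar, is represented by the shared 'unbound_reg' above. Any
 * other kind of slot is not treated as a scalar and NULL is returned.
 */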
16827static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
16828						  struct bpf_stack_state *stack)
16829{
16830	if (is_spilled_scalar_reg64(stack))
16831		return &stack->spilled_ptr;
16832
16833	if (is_stack_all_misc(env, stack))
16834		return &unbound_reg;
16835
16836	return NULL;
16837}
16838
16839static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
16840		      struct bpf_func_state *cur, struct bpf_idmap *idmap,
16841		      enum exact_level exact)
16842{
16843	int i, spi;
16844
16845	/* walk slots of the explored stack and ignore any additional
	 * slots in the current stack, since the explored (safe) state
16847	 * didn't use them
16848	 */
16849	for (i = 0; i < old->allocated_stack; i++) {
16850		struct bpf_reg_state *old_reg, *cur_reg;
16851
16852		spi = i / BPF_REG_SIZE;
16853
16854		if (exact != NOT_EXACT &&
16855		    old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
16856		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
16857			return false;
16858
16859		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
16860		    && exact == NOT_EXACT) {
16861			i += BPF_REG_SIZE - 1;
16862			/* explored state didn't use this */
16863			continue;
16864		}
16865
16866		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
16867			continue;
16868
16869		if (env->allow_uninit_stack &&
16870		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
16871			continue;
16872
16873		/* explored stack has more populated slots than current stack
16874		 * and these slots were used
16875		 */
16876		if (i >= cur->allocated_stack)
16877			return false;
16878
16879		/* 64-bit scalar spill vs all slots MISC and vice versa.
16880		 * Load from all slots MISC produces unbound scalar.
16881		 * Construct a fake register for such stack and call
16882		 * regsafe() to ensure scalar ids are compared.
16883		 */
16884		old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
16885		cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
16886		if (old_reg && cur_reg) {
16887			if (!regsafe(env, old_reg, cur_reg, idmap, exact))
16888				return false;
16889			i += BPF_REG_SIZE - 1;
16890			continue;
16891		}
16892
16893		/* if old state was safe with misc data in the stack
16894		 * it will be safe with zero-initialized stack.
16895		 * The opposite is not true
16896		 */
16897		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
16898		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
16899			continue;
16900		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
16901		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
16902			/* Ex: old explored (safe) state has STACK_SPILL in
16903			 * this stack slot, but current has STACK_MISC ->
			 * these verifier states are not equivalent;
16905			 * return false to continue verification of this path
16906			 */
16907			return false;
16908		if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
16909			continue;
		/* Both old and cur have the same slot_type */
16911		switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
16912		case STACK_SPILL:
			/* when the explored and current stack slots both store
			 * spilled registers, check that the stored pointer types
			 * are the same as well.
16916			 * Ex: explored safe path could have stored
16917			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
16918			 * but current path has stored:
16919			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
			 * such verifier states are not equivalent;
16921			 * return false to continue verification of this path
16922			 */
16923			if (!regsafe(env, &old->stack[spi].spilled_ptr,
16924				     &cur->stack[spi].spilled_ptr, idmap, exact))
16925				return false;
16926			break;
16927		case STACK_DYNPTR:
16928			old_reg = &old->stack[spi].spilled_ptr;
16929			cur_reg = &cur->stack[spi].spilled_ptr;
16930			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
16931			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
16932			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
16933				return false;
16934			break;
16935		case STACK_ITER:
16936			old_reg = &old->stack[spi].spilled_ptr;
16937			cur_reg = &cur->stack[spi].spilled_ptr;
16938			/* iter.depth is not compared between states as it
16939			 * doesn't matter for correctness and would otherwise
16940			 * prevent convergence; we maintain it only to prevent
16941			 * infinite loop check triggering, see
16942			 * iter_active_depths_differ()
16943			 */
16944			if (old_reg->iter.btf != cur_reg->iter.btf ||
16945			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
16946			    old_reg->iter.state != cur_reg->iter.state ||
16947			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
16948			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
16949				return false;
16950			break;
16951		case STACK_MISC:
16952		case STACK_ZERO:
16953		case STACK_INVALID:
16954			continue;
16955		/* Ensure that new unhandled slot types return false by default */
16956		default:
16957			return false;
16958		}
16959	}
16960	return true;
16961}
16962
16963static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
16964		    struct bpf_idmap *idmap)
16965{
16966	int i;
16967
16968	if (old->acquired_refs != cur->acquired_refs)
16969		return false;
16970
16971	for (i = 0; i < old->acquired_refs; i++) {
16972		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
16973			return false;
16974	}
16975
16976	return true;
16977}
16978
16979/* compare two verifier states
16980 *
16981 * all states stored in state_list are known to be valid, since
16982 * verifier reached 'bpf_exit' instruction through them
16983 *
 * this function is called when the verifier explores different branches of
16985 * execution popped from the state stack. If it sees an old state that has
16986 * more strict register state and more strict stack state then this execution
16987 * branch doesn't need to be explored further, since verifier already
16988 * concluded that more strict state leads to valid finish.
16989 *
16990 * Therefore two states are equivalent if register state is more conservative
16991 * and explored stack state is more conservative than the current one.
16992 * Example:
16993 *       explored                   current
16994 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
16995 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
16996 *
16997 * In other words if current stack state (one being explored) has more
16998 * valid slots than old one that already passed validation, it means
16999 * the verifier can stop exploring and conclude that current state is valid too
17000 *
17001 * Similarly with registers. If explored state has register type as invalid
17002 * whereas register type in current state is meaningful, it means that
17003 * the current state will reach 'bpf_exit' instruction safely
17004 */
17005static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
17006			      struct bpf_func_state *cur, enum exact_level exact)
17007{
17008	int i;
17009
17010	if (old->callback_depth > cur->callback_depth)
17011		return false;
17012
17013	for (i = 0; i < MAX_BPF_REG; i++)
17014		if (!regsafe(env, &old->regs[i], &cur->regs[i],
17015			     &env->idmap_scratch, exact))
17016			return false;
17017
17018	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
17019		return false;
17020
17021	if (!refsafe(old, cur, &env->idmap_scratch))
17022		return false;
17023
17024	return true;
17025}
17026
17027static void reset_idmap_scratch(struct bpf_verifier_env *env)
17028{
17029	env->idmap_scratch.tmp_id_gen = env->id_gen;
17030	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
17031}
17032
17033static bool states_equal(struct bpf_verifier_env *env,
17034			 struct bpf_verifier_state *old,
17035			 struct bpf_verifier_state *cur,
17036			 enum exact_level exact)
17037{
17038	int i;
17039
17040	if (old->curframe != cur->curframe)
17041		return false;
17042
17043	reset_idmap_scratch(env);
17044
17045	/* Verification state from speculative execution simulation
17046	 * must never prune a non-speculative execution one.
17047	 */
17048	if (old->speculative && !cur->speculative)
17049		return false;
17050
17051	if (old->active_lock.ptr != cur->active_lock.ptr)
17052		return false;
17053
17054	/* Old and cur active_lock's have to be either both present
17055	 * or both absent.
17056	 */
17057	if (!!old->active_lock.id != !!cur->active_lock.id)
17058		return false;
17059
17060	if (old->active_lock.id &&
17061	    !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
17062		return false;
17063
17064	if (old->active_rcu_lock != cur->active_rcu_lock)
17065		return false;
17066
17067	if (old->active_preempt_lock != cur->active_preempt_lock)
17068		return false;
17069
17070	if (old->in_sleepable != cur->in_sleepable)
17071		return false;
17072
17073	/* for states to be equal callsites have to be the same
17074	 * and all frame states need to be equivalent
17075	 */
17076	for (i = 0; i <= old->curframe; i++) {
17077		if (old->frame[i]->callsite != cur->frame[i]->callsite)
17078			return false;
17079		if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
17080			return false;
17081	}
17082	return true;
17083}
17084
17085/* Return 0 if no propagation happened. Return negative error code if error
17086 * happened. Otherwise, return the propagated bit.
17087 */
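/* For example, if the parent was only marked REG_LIVE_READ32 but this state
 * read the full 64 bits, the REG_LIVE_READ64 mark is propagated to the parent
 * and REG_LIVE_READ64 is returned, so that the caller can also request
 * zero-extension of the parent register via mark_insn_zext().
 */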
17088static int propagate_liveness_reg(struct bpf_verifier_env *env,
17089				  struct bpf_reg_state *reg,
17090				  struct bpf_reg_state *parent_reg)
17091{
17092	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
17093	u8 flag = reg->live & REG_LIVE_READ;
17094	int err;
17095
	/* When we get here, the read flags of PARENT_REG or REG could be any of
	 * REG_LIVE_READ64, REG_LIVE_READ32 or REG_LIVE_NONE. There is no need
	 * to propagate if PARENT_REG already has the strongest REG_LIVE_READ64.
17099	 */
17100	if (parent_flag == REG_LIVE_READ64 ||
17101	    /* Or if there is no read flag from REG. */
17102	    !flag ||
17103	    /* Or if the read flag from REG is the same as PARENT_REG. */
17104	    parent_flag == flag)
17105		return 0;
17106
17107	err = mark_reg_read(env, reg, parent_reg, flag);
17108	if (err)
17109		return err;
17110
17111	return flag;
17112}
17113
17114/* A write screens off any subsequent reads; but write marks come from the
17115 * straight-line code between a state and its parent.  When we arrive at an
17116 * equivalent state (jump target or such) we didn't arrive by the straight-line
17117 * code, so read marks in the state must propagate to the parent regardless
17118 * of the state's write marks. That's what 'parent == state->parent' comparison
17119 * in mark_reg_read() is for.
17120 */
17121static int propagate_liveness(struct bpf_verifier_env *env,
17122			      const struct bpf_verifier_state *vstate,
17123			      struct bpf_verifier_state *vparent)
17124{
17125	struct bpf_reg_state *state_reg, *parent_reg;
17126	struct bpf_func_state *state, *parent;
17127	int i, frame, err = 0;
17128
17129	if (vparent->curframe != vstate->curframe) {
17130		WARN(1, "propagate_live: parent frame %d current frame %d\n",
17131		     vparent->curframe, vstate->curframe);
17132		return -EFAULT;
17133	}
17134	/* Propagate read liveness of registers... */
17135	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
17136	for (frame = 0; frame <= vstate->curframe; frame++) {
17137		parent = vparent->frame[frame];
17138		state = vstate->frame[frame];
17139		parent_reg = parent->regs;
17140		state_reg = state->regs;
17141		/* We don't need to worry about FP liveness, it's read-only */
17142		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
17143			err = propagate_liveness_reg(env, &state_reg[i],
17144						     &parent_reg[i]);
17145			if (err < 0)
17146				return err;
17147			if (err == REG_LIVE_READ64)
17148				mark_insn_zext(env, &parent_reg[i]);
17149		}
17150
17151		/* Propagate stack slots. */
17152		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
17153			    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
17154			parent_reg = &parent->stack[i].spilled_ptr;
17155			state_reg = &state->stack[i].spilled_ptr;
17156			err = propagate_liveness_reg(env, state_reg,
17157						     parent_reg);
17158			if (err < 0)
17159				return err;
17160		}
17161	}
17162	return 0;
17163}
17164
17165/* find precise scalars in the previous equivalent state and
17166 * propagate them into the current state
17167 */
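/* For example, if the old state only passed verification because r6 was known
 * to be a precise constant (say, a bounded array index), r6 is collected into
 * the backtracking state here and mark_chain_precision_batch() then walks the
 * current state's jump history to mark it, and everything it was derived
 * from, as precise as well.
 */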
17168static int propagate_precision(struct bpf_verifier_env *env,
17169			       const struct bpf_verifier_state *old)
17170{
17171	struct bpf_reg_state *state_reg;
17172	struct bpf_func_state *state;
17173	int i, err = 0, fr;
17174	bool first;
17175
17176	for (fr = old->curframe; fr >= 0; fr--) {
17177		state = old->frame[fr];
17178		state_reg = state->regs;
17179		first = true;
17180		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
17181			if (state_reg->type != SCALAR_VALUE ||
17182			    !state_reg->precise ||
17183			    !(state_reg->live & REG_LIVE_READ))
17184				continue;
17185			if (env->log.level & BPF_LOG_LEVEL2) {
17186				if (first)
17187					verbose(env, "frame %d: propagating r%d", fr, i);
17188				else
17189					verbose(env, ",r%d", i);
17190			}
17191			bt_set_frame_reg(&env->bt, fr, i);
17192			first = false;
17193		}
17194
17195		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
17196			if (!is_spilled_reg(&state->stack[i]))
17197				continue;
17198			state_reg = &state->stack[i].spilled_ptr;
17199			if (state_reg->type != SCALAR_VALUE ||
17200			    !state_reg->precise ||
17201			    !(state_reg->live & REG_LIVE_READ))
17202				continue;
17203			if (env->log.level & BPF_LOG_LEVEL2) {
17204				if (first)
17205					verbose(env, "frame %d: propagating fp%d",
17206						fr, (-i - 1) * BPF_REG_SIZE);
17207				else
17208					verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
17209			}
17210			bt_set_frame_slot(&env->bt, fr, i);
17211			first = false;
17212		}
17213		if (!first)
17214			verbose(env, "\n");
17215	}
17216
17217	err = mark_chain_precision_batch(env);
17218	if (err < 0)
17219		return err;
17220
17221	return 0;
17222}
17223
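/* Heuristic check for a possible loop: in the current (top-most) frame,
 * compare each register byte-for-byte up to the 'parent' member, i.e.
 * ignoring only the parentage/liveness/precision bookkeeping that follows it.
 * A match does not prove equivalence (ids and ranges are compared literally),
 * it merely justifies running the more expensive exact states_equal() check.
 */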
17224static bool states_maybe_looping(struct bpf_verifier_state *old,
17225				 struct bpf_verifier_state *cur)
17226{
17227	struct bpf_func_state *fold, *fcur;
17228	int i, fr = cur->curframe;
17229
17230	if (old->curframe != fr)
17231		return false;
17232
17233	fold = old->frame[fr];
17234	fcur = cur->frame[fr];
17235	for (i = 0; i < MAX_BPF_REG; i++)
17236		if (memcmp(&fold->regs[i], &fcur->regs[i],
17237			   offsetof(struct bpf_reg_state, parent)))
17238			return false;
17239	return true;
17240}
17241
17242static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
17243{
17244	return env->insn_aux_data[insn_idx].is_iter_next;
17245}
17246
17247/* is_state_visited() handles iter_next() (see process_iter_next_call() for
17248 * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
17249 * states to match, which otherwise would look like an infinite loop. So while
17250 * iter_next() calls are taken care of, we still need to be careful and
17251 * prevent erroneous and too eager declaration of "ininite loop", when
17252 * iterators are involved.
17253 *
17254 * Here's a situation in pseudo-BPF assembly form:
17255 *
17256 *   0: again:                          ; set up iter_next() call args
17257 *   1:   r1 = &it                      ; <CHECKPOINT HERE>
17258 *   2:   call bpf_iter_num_next        ; this is iter_next() call
17259 *   3:   if r0 == 0 goto done
17260 *   4:   ... something useful here ...
17261 *   5:   goto again                    ; another iteration
17262 *   6: done:
17263 *   7:   r1 = &it
17264 *   8:   call bpf_iter_num_destroy     ; clean up iter state
17265 *   9:   exit
17266 *
17267 * This is a typical loop. Let's assume that we have a prune point at 1:,
17268 * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
17269 * again`, assuming other heuristics don't get in a way).
17270 *
 * When we first come to 1:, let's say we have some state X. We proceed
17272 * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
17273 * Now we come back to validate that forked ACTIVE state. We proceed through
17274 * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
17275 * are converging. But the problem is that we don't know that yet, as this
17276 * convergence has to happen at iter_next() call site only. So if nothing is
17277 * done, at 1: verifier will use bounded loop logic and declare infinite
17278 * looping (and would be *technically* correct, if not for iterator's
17279 * "eventual sticky NULL" contract, see process_iter_next_call()). But we
17280 * don't want that. So what we do in process_iter_next_call() when we go on
17281 * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
17282 * a different iteration. So when we suspect an infinite loop, we additionally
17283 * check if any of the *ACTIVE* iterator states depths differ. If yes, we
17284 * pretend we are not looping and wait for next iter_next() call.
17285 *
17286 * This only applies to ACTIVE state. In DRAINED state we don't expect to
17287 * loop, because that would actually mean infinite loop, as DRAINED state is
17288 * "sticky", and so we'll keep returning into the same instruction with the
17289 * same state (at least in one of possible code paths).
17290 *
17291 * This approach allows to keep infinite loop heuristic even in the face of
 * active iterator. E.g., the C snippet below is and will be detected as
 * infinitely looping:
17294 *
17295 *   struct bpf_iter_num it;
17296 *   int *p, x;
17297 *
17298 *   bpf_iter_num_new(&it, 0, 10);
 *   while ((p = bpf_iter_num_next(&it))) {
 *       x = *p;
17301 *       while (x--) {} // <<-- infinite loop here
17302 *   }
17303 *
17304 */
17305static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
17306{
17307	struct bpf_reg_state *slot, *cur_slot;
17308	struct bpf_func_state *state;
17309	int i, fr;
17310
17311	for (fr = old->curframe; fr >= 0; fr--) {
17312		state = old->frame[fr];
17313		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
17314			if (state->stack[i].slot_type[0] != STACK_ITER)
17315				continue;
17316
17317			slot = &state->stack[i].spilled_ptr;
17318			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
17319				continue;
17320
17321			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
17322			if (cur_slot->iter.depth != slot->iter.depth)
17323				return true;
17324		}
17325	}
17326	return false;
17327}
17328
17329static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
17330{
17331	struct bpf_verifier_state_list *new_sl;
17332	struct bpf_verifier_state_list *sl, **pprev;
17333	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
17334	int i, j, n, err, states_cnt = 0;
17335	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
17336	bool add_new_state = force_new_state;
17337	bool force_exact;
17338
	/* bpf progs typically have a pruning point every 4 instructions
	 * http://vger.kernel.org/bpfconf2019.html#session-1
	 * Do not add new state for future pruning if the verifier hasn't seen
	 * at least 2 jumps and at least 8 instructions.
	 * This heuristic helps decrease the 'total_states' and 'peak_states' metrics.
	 * In tests that amounts to up to a 50% reduction in total verifier
	 * memory consumption and a 20% verifier time speedup.
17346	 */
17347	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
17348	    env->insn_processed - env->prev_insn_processed >= 8)
17349		add_new_state = true;
17350
17351	pprev = explored_state(env, insn_idx);
17352	sl = *pprev;
17353
17354	clean_live_states(env, insn_idx, cur);
17355
17356	while (sl) {
17357		states_cnt++;
17358		if (sl->state.insn_idx != insn_idx)
17359			goto next;
17360
17361		if (sl->state.branches) {
17362			struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
17363
17364			if (frame->in_async_callback_fn &&
17365			    frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
17366				/* Different async_entry_cnt means that the verifier is
17367				 * processing another entry into async callback.
17368				 * Seeing the same state is not an indication of infinite
17369				 * loop or infinite recursion.
17370				 * But finding the same state doesn't mean that it's safe
17371				 * to stop processing the current state. The previous state
17372				 * hasn't yet reached bpf_exit, since state.branches > 0.
				 * Checking in_async_callback_fn alone is not enough either,
				 * since the verifier still needs to catch infinite loops
				 * inside async callbacks.
17376				 */
17377				goto skip_inf_loop_check;
17378			}
17379			/* BPF open-coded iterators loop detection is special.
17380			 * states_maybe_looping() logic is too simplistic in detecting
17381			 * states that *might* be equivalent, because it doesn't know
17382			 * about ID remapping, so don't even perform it.
17383			 * See process_iter_next_call() and iter_active_depths_differ()
17384			 * for overview of the logic. When current and one of parent
17385			 * states are detected as equivalent, it's a good thing: we prove
17386			 * convergence and can stop simulating further iterations.
17387			 * It's safe to assume that iterator loop will finish, taking into
17388			 * account iter_next() contract of eventually returning
17389			 * sticky NULL result.
17390			 *
			 * Note that states have to be compared exactly in this case because
17392			 * read and precision marks might not be finalized inside the loop.
17393			 * E.g. as in the program below:
17394			 *
17395			 *     1. r7 = -16
17396			 *     2. r6 = bpf_get_prandom_u32()
17397			 *     3. while (bpf_iter_num_next(&fp[-8])) {
17398			 *     4.   if (r6 != 42) {
17399			 *     5.     r7 = -32
17400			 *     6.     r6 = bpf_get_prandom_u32()
17401			 *     7.     continue
17402			 *     8.   }
17403			 *     9.   r0 = r10
17404			 *    10.   r0 += r7
17405			 *    11.   r8 = *(u64 *)(r0 + 0)
17406			 *    12.   r6 = bpf_get_prandom_u32()
17407			 *    13. }
17408			 *
17409			 * Here verifier would first visit path 1-3, create a checkpoint at 3
17410			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
17411			 * not have read or precision mark for r7 yet, thus inexact states
17412			 * comparison would discard current state with r7=-32
17413			 * => unsafe memory access at 11 would not be caught.
17414			 */
17415			if (is_iter_next_insn(env, insn_idx)) {
17416				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
17417					struct bpf_func_state *cur_frame;
17418					struct bpf_reg_state *iter_state, *iter_reg;
17419					int spi;
17420
17421					cur_frame = cur->frame[cur->curframe];
17422					/* btf_check_iter_kfuncs() enforces that
17423					 * iter state pointer is always the first arg
17424					 */
17425					iter_reg = &cur_frame->regs[BPF_REG_1];
17426					/* current state is valid due to states_equal(),
17427					 * so we can assume valid iter and reg state,
17428					 * no need for extra (re-)validations
17429					 */
17430					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
17431					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
17432					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
17433						update_loop_entry(cur, &sl->state);
17434						goto hit;
17435					}
17436				}
17437				goto skip_inf_loop_check;
17438			}
17439			if (is_may_goto_insn_at(env, insn_idx)) {
17440				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
17441					update_loop_entry(cur, &sl->state);
17442					goto hit;
17443				}
17444				goto skip_inf_loop_check;
17445			}
17446			if (calls_callback(env, insn_idx)) {
17447				if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
17448					goto hit;
17449				goto skip_inf_loop_check;
17450			}
			/* attempt to detect an infinite loop to avoid unnecessary doomed work */
17452			if (states_maybe_looping(&sl->state, cur) &&
17453			    states_equal(env, &sl->state, cur, EXACT) &&
17454			    !iter_active_depths_differ(&sl->state, cur) &&
17455			    sl->state.may_goto_depth == cur->may_goto_depth &&
17456			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
17457				verbose_linfo(env, insn_idx, "; ");
17458				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
17459				verbose(env, "cur state:");
17460				print_verifier_state(env, cur->frame[cur->curframe], true);
17461				verbose(env, "old state:");
17462				print_verifier_state(env, sl->state.frame[cur->curframe], true);
17463				return -EINVAL;
17464			}
17465			/* if the verifier is processing a loop, avoid adding new state
17466			 * too often, since different loop iterations have distinct
17467			 * states and may not help future pruning.
			 * This threshold shouldn't be too low, to make sure that
			 * a loop with a large bound will be rejected quickly.
17470			 * The most abusive loop will be:
17471			 * r1 += 1
17472			 * if r1 < 1000000 goto pc-2
			 * 1M insn_processed limit / 100 == 10k peak states.
17474			 * This threshold shouldn't be too high either, since states
17475			 * at the end of the loop are likely to be useful in pruning.
17476			 */
17477skip_inf_loop_check:
17478			if (!force_new_state &&
17479			    env->jmps_processed - env->prev_jmps_processed < 20 &&
17480			    env->insn_processed - env->prev_insn_processed < 100)
17481				add_new_state = false;
17482			goto miss;
17483		}
17484		/* If sl->state is a part of a loop and this loop's entry is a part of
17485		 * current verification path then states have to be compared exactly.
17486		 * 'force_exact' is needed to catch the following case:
17487		 *
17488		 *                initial     Here state 'succ' was processed first,
17489		 *                  |         it was eventually tracked to produce a
17490		 *                  V         state identical to 'hdr'.
17491		 *     .---------> hdr        All branches from 'succ' had been explored
17492		 *     |            |         and thus 'succ' has its .branches == 0.
17493		 *     |            V
17494		 *     |    .------...        Suppose states 'cur' and 'succ' correspond
17495		 *     |    |       |         to the same instruction + callsites.
17496		 *     |    V       V         In such case it is necessary to check
17497		 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
17498		 *     |    |       |         If 'succ' and 'cur' are a part of the
17499		 *     |    V       V         same loop exact flag has to be set.
17500		 *     |   succ <- cur        To check if that is the case, verify
17501		 *     |    |                 if loop entry of 'succ' is in current
17502		 *     |    V                 DFS path.
17503		 *     |   ...
17504		 *     |    |
17505		 *     '----'
17506		 *
17507		 * Additional details are in the comment before get_loop_entry().
17508		 */
17509		loop_entry = get_loop_entry(&sl->state);
17510		force_exact = loop_entry && loop_entry->branches > 0;
17511		if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
17512			if (force_exact)
17513				update_loop_entry(cur, loop_entry);
17514hit:
17515			sl->hit_cnt++;
17516			/* reached equivalent register/stack state,
17517			 * prune the search.
17518			 * Registers read by the continuation are read by us.
17519			 * If we have any write marks in env->cur_state, they
17520			 * will prevent corresponding reads in the continuation
17521			 * from reaching our parent (an explored_state).  Our
17522			 * own state will get the read marks recorded, but
17523			 * they'll be immediately forgotten as we're pruning
17524			 * this state and will pop a new one.
17525			 */
17526			err = propagate_liveness(env, &sl->state, cur);
17527
17528			/* if previous state reached the exit with precision and
17529			 * current state is equivalent to it (except precision marks)
17530			 * the precision needs to be propagated back in
17531			 * the current state.
17532			 */
17533			if (is_jmp_point(env, env->insn_idx))
17534				err = err ? : push_jmp_history(env, cur, 0);
17535			err = err ? : propagate_precision(env, &sl->state);
17536			if (err)
17537				return err;
17538			return 1;
17539		}
17540miss:
		/* when a new state is not going to be added, do not increase the miss count.
17542		 * Otherwise several loop iterations will remove the state
17543		 * recorded earlier. The goal of these heuristics is to have
17544		 * states from some iterations of the loop (some in the beginning
17545		 * and some at the end) to help pruning.
17546		 */
17547		if (add_new_state)
17548			sl->miss_cnt++;
17549		/* heuristic to determine whether this state is beneficial
17550		 * to keep checking from state equivalence point of view.
17551		 * Higher numbers increase max_states_per_insn and verification time,
17552		 * but do not meaningfully decrease insn_processed.
17553		 * 'n' controls how many times state could miss before eviction.
17554		 * Use bigger 'n' for checkpoints because evicting checkpoint states
17555		 * too early would hinder iterator convergence.
17556		 */
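		/* e.g. with n == 3, a state is evicted once it has missed
		 * more than 3 * hit_cnt + 3 times.
		 */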
17557		n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
17558		if (sl->miss_cnt > sl->hit_cnt * n + n) {
17559			/* the state is unlikely to be useful. Remove it to
17560			 * speed up verification
17561			 */
17562			*pprev = sl->next;
17563			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
17564			    !sl->state.used_as_loop_entry) {
17565				u32 br = sl->state.branches;
17566
17567				WARN_ONCE(br,
17568					  "BUG live_done but branches_to_explore %d\n",
17569					  br);
17570				free_verifier_state(&sl->state, false);
17571				kfree(sl);
17572				env->peak_states--;
17573			} else {
				/* cannot free this state, since the parentage chain may
				 * walk it later. Add it to the free_list instead, to
				 * be freed at the end of verification
17577				 */
17578				sl->next = env->free_list;
17579				env->free_list = sl;
17580			}
17581			sl = *pprev;
17582			continue;
17583		}
17584next:
17585		pprev = &sl->next;
17586		sl = *pprev;
17587	}
17588
17589	if (env->max_states_per_insn < states_cnt)
17590		env->max_states_per_insn = states_cnt;
17591
17592	if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
17593		return 0;
17594
17595	if (!add_new_state)
17596		return 0;
17597
17598	/* There were no equivalent states, remember the current one.
17599	 * Technically the current state is not proven to be safe yet,
	 * but it will either reach the outermost bpf_exit (which means it's safe)
17601	 * or it will be rejected. When there are no loops the verifier won't be
17602	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
17603	 * again on the way to bpf_exit.
17604	 * When looping the sl->state.branches will be > 0 and this state
17605	 * will not be considered for equivalence until branches == 0.
17606	 */
17607	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
17608	if (!new_sl)
17609		return -ENOMEM;
17610	env->total_states++;
17611	env->peak_states++;
17612	env->prev_jmps_processed = env->jmps_processed;
17613	env->prev_insn_processed = env->insn_processed;
17614
17615	/* forget precise markings we inherited, see __mark_chain_precision */
17616	if (env->bpf_capable)
17617		mark_all_scalars_imprecise(env, cur);
17618
17619	/* add new state to the head of linked list */
17620	new = &new_sl->state;
17621	err = copy_verifier_state(new, cur);
17622	if (err) {
17623		free_verifier_state(new, false);
17624		kfree(new_sl);
17625		return err;
17626	}
17627	new->insn_idx = insn_idx;
17628	WARN_ONCE(new->branches != 1,
17629		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
17630
17631	cur->parent = new;
17632	cur->first_insn_idx = insn_idx;
17633	cur->dfs_depth = new->dfs_depth + 1;
17634	clear_jmp_history(cur);
17635	new_sl->next = *explored_state(env, insn_idx);
17636	*explored_state(env, insn_idx) = new_sl;
17637	/* connect new state to parentage chain. Current frame needs all
17638	 * registers connected. Only r6 - r9 of the callers are alive (pushed
17639	 * to the stack implicitly by JITs) so in callers' frames connect just
17640	 * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
17641	 * the state of the call instruction (with WRITTEN set), and r0 comes
17642	 * from callee with its full parentage chain, anyway.
17643	 */
17644	/* clear write marks in current state: the writes we did are not writes
17645	 * our child did, so they don't screen off its reads from us.
17646	 * (There are no read marks in current state, because reads always mark
17647	 * their parent and current state never has children yet.  Only
17648	 * explored_states can get read marks.)
17649	 */
17650	for (j = 0; j <= cur->curframe; j++) {
17651		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
17652			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
17653		for (i = 0; i < BPF_REG_FP; i++)
17654			cur->frame[j]->regs[i].live = REG_LIVE_NONE;
17655	}
17656
17657	/* all stack frames are accessible from callee, clear them all */
17658	for (j = 0; j <= cur->curframe; j++) {
17659		struct bpf_func_state *frame = cur->frame[j];
17660		struct bpf_func_state *newframe = new->frame[j];
17661
17662		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
17663			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
17664			frame->stack[i].spilled_ptr.parent =
17665						&newframe->stack[i].spilled_ptr;
17666		}
17667	}
17668	return 0;
17669}
17670
17671/* Return true if it's OK to have the same insn return a different type. */
17672static bool reg_type_mismatch_ok(enum bpf_reg_type type)
17673{
17674	switch (base_type(type)) {
17675	case PTR_TO_CTX:
17676	case PTR_TO_SOCKET:
17677	case PTR_TO_SOCK_COMMON:
17678	case PTR_TO_TCP_SOCK:
17679	case PTR_TO_XDP_SOCK:
17680	case PTR_TO_BTF_ID:
17681	case PTR_TO_ARENA:
17682		return false;
17683	default:
17684		return true;
17685	}
17686}
17687
17688/* If an instruction was previously used with particular pointer types, then we
17689 * need to be careful to avoid cases such as the below, where it may be ok
 * for one branch to access the pointer, but not ok for the other branch:
17691 *
17692 * R1 = sock_ptr
17693 * goto X;
17694 * ...
17695 * R1 = some_other_valid_ptr;
17696 * goto X;
17697 * ...
17698 * R2 = *(u32 *)(R1 + 0);
17699 */
17700static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
17701{
17702	return src != prev && (!reg_type_mismatch_ok(src) ||
17703			       !reg_type_mismatch_ok(prev));
17704}
17705
17706static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
17707			     bool allow_trust_mismatch)
17708{
17709	enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
17710
17711	if (*prev_type == NOT_INIT) {
17712		/* Saw a valid insn
17713		 * dst_reg = *(u32 *)(src_reg + off)
17714		 * save type to validate intersecting paths
17715		 */
17716		*prev_type = type;
17717	} else if (reg_type_mismatch(type, *prev_type)) {
		/* An abusive program is trying to use the same insn
17719		 * dst_reg = *(u32*) (src_reg + off)
17720		 * with different pointer types:
17721		 * src_reg == ctx in one branch and
17722		 * src_reg == stack|map in some other branch.
17723		 * Reject it.
17724		 */
17725		if (allow_trust_mismatch &&
17726		    base_type(type) == PTR_TO_BTF_ID &&
17727		    base_type(*prev_type) == PTR_TO_BTF_ID) {
17728			/*
17729			 * Have to support a use case when one path through
17730			 * the program yields TRUSTED pointer while another
			 * is UNTRUSTED. Fall back to UNTRUSTED to generate
17732			 * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
17733			 */
17734			*prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
17735		} else {
17736			verbose(env, "same insn cannot be used with different pointers\n");
17737			return -EINVAL;
17738		}
17739	}
17740
17741	return 0;
17742}
17743
17744static int do_check(struct bpf_verifier_env *env)
17745{
17746	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
17747	struct bpf_verifier_state *state = env->cur_state;
17748	struct bpf_insn *insns = env->prog->insnsi;
17749	struct bpf_reg_state *regs;
17750	int insn_cnt = env->prog->len;
17751	bool do_print_state = false;
17752	int prev_insn_idx = -1;
17753
17754	for (;;) {
17755		bool exception_exit = false;
17756		struct bpf_insn *insn;
17757		u8 class;
17758		int err;
17759
17760		/* reset current history entry on each new instruction */
17761		env->cur_hist_ent = NULL;
17762
17763		env->prev_insn_idx = prev_insn_idx;
17764		if (env->insn_idx >= insn_cnt) {
17765			verbose(env, "invalid insn idx %d insn_cnt %d\n",
17766				env->insn_idx, insn_cnt);
17767			return -EFAULT;
17768		}
17769
17770		insn = &insns[env->insn_idx];
17771		class = BPF_CLASS(insn->code);
17772
17773		if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
17774			verbose(env,
17775				"BPF program is too large. Processed %d insn\n",
17776				env->insn_processed);
17777			return -E2BIG;
17778		}
17779
17780		state->last_insn_idx = env->prev_insn_idx;
17781
17782		if (is_prune_point(env, env->insn_idx)) {
17783			err = is_state_visited(env, env->insn_idx);
17784			if (err < 0)
17785				return err;
17786			if (err == 1) {
17787				/* found equivalent state, can prune the search */
17788				if (env->log.level & BPF_LOG_LEVEL) {
17789					if (do_print_state)
17790						verbose(env, "\nfrom %d to %d%s: safe\n",
17791							env->prev_insn_idx, env->insn_idx,
17792							env->cur_state->speculative ?
17793							" (speculative execution)" : "");
17794					else
17795						verbose(env, "%d: safe\n", env->insn_idx);
17796				}
17797				goto process_bpf_exit;
17798			}
17799		}
17800
17801		if (is_jmp_point(env, env->insn_idx)) {
17802			err = push_jmp_history(env, state, 0);
17803			if (err)
17804				return err;
17805		}
17806
17807		if (signal_pending(current))
17808			return -EAGAIN;
17809
17810		if (need_resched())
17811			cond_resched();
17812
17813		if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
17814			verbose(env, "\nfrom %d to %d%s:",
17815				env->prev_insn_idx, env->insn_idx,
17816				env->cur_state->speculative ?
17817				" (speculative execution)" : "");
17818			print_verifier_state(env, state->frame[state->curframe], true);
17819			do_print_state = false;
17820		}
17821
17822		if (env->log.level & BPF_LOG_LEVEL) {
17823			const struct bpf_insn_cbs cbs = {
17824				.cb_call	= disasm_kfunc_name,
17825				.cb_print	= verbose,
17826				.private_data	= env,
17827			};
17828
17829			if (verifier_state_scratched(env))
17830				print_insn_state(env, state->frame[state->curframe]);
17831
17832			verbose_linfo(env, env->insn_idx, "; ");
17833			env->prev_log_pos = env->log.end_pos;
17834			verbose(env, "%d: ", env->insn_idx);
17835			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
17836			env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
17837			env->prev_log_pos = env->log.end_pos;
17838		}
17839
17840		if (bpf_prog_is_offloaded(env->prog->aux)) {
17841			err = bpf_prog_offload_verify_insn(env, env->insn_idx,
17842							   env->prev_insn_idx);
17843			if (err)
17844				return err;
17845		}
17846
17847		regs = cur_regs(env);
17848		sanitize_mark_insn_seen(env);
17849		prev_insn_idx = env->insn_idx;
17850
17851		if (class == BPF_ALU || class == BPF_ALU64) {
17852			err = check_alu_op(env, insn);
17853			if (err)
17854				return err;
17855
17856		} else if (class == BPF_LDX) {
17857			enum bpf_reg_type src_reg_type;
17858
17859			/* check for reserved fields is already done */
17860
17861			/* check src operand */
17862			err = check_reg_arg(env, insn->src_reg, SRC_OP);
17863			if (err)
17864				return err;
17865
17866			err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
17867			if (err)
17868				return err;
17869
17870			src_reg_type = regs[insn->src_reg].type;
17871
17872			/* check that memory (src_reg + off) is readable,
17873			 * the state of dst_reg will be updated by this func
17874			 */
17875			err = check_mem_access(env, env->insn_idx, insn->src_reg,
17876					       insn->off, BPF_SIZE(insn->code),
17877					       BPF_READ, insn->dst_reg, false,
17878					       BPF_MODE(insn->code) == BPF_MEMSX);
17879			err = err ?: save_aux_ptr_type(env, src_reg_type, true);
17880			err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
17881			if (err)
17882				return err;
17883		} else if (class == BPF_STX) {
17884			enum bpf_reg_type dst_reg_type;
17885
17886			if (BPF_MODE(insn->code) == BPF_ATOMIC) {
17887				err = check_atomic(env, env->insn_idx, insn);
17888				if (err)
17889					return err;
17890				env->insn_idx++;
17891				continue;
17892			}
17893
17894			if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
17895				verbose(env, "BPF_STX uses reserved fields\n");
17896				return -EINVAL;
17897			}
17898
17899			/* check src1 operand */
17900			err = check_reg_arg(env, insn->src_reg, SRC_OP);
17901			if (err)
17902				return err;
17903			/* check src2 operand */
17904			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
17905			if (err)
17906				return err;
17907
17908			dst_reg_type = regs[insn->dst_reg].type;
17909
17910			/* check that memory (dst_reg + off) is writeable */
17911			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
17912					       insn->off, BPF_SIZE(insn->code),
17913					       BPF_WRITE, insn->src_reg, false, false);
17914			if (err)
17915				return err;
17916
17917			err = save_aux_ptr_type(env, dst_reg_type, false);
17918			if (err)
17919				return err;
17920		} else if (class == BPF_ST) {
17921			enum bpf_reg_type dst_reg_type;
17922
17923			if (BPF_MODE(insn->code) != BPF_MEM ||
17924			    insn->src_reg != BPF_REG_0) {
17925				verbose(env, "BPF_ST uses reserved fields\n");
17926				return -EINVAL;
17927			}
17928			/* check src operand */
17929			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
17930			if (err)
17931				return err;
17932
17933			dst_reg_type = regs[insn->dst_reg].type;
17934
17935			/* check that memory (dst_reg + off) is writeable */
17936			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
17937					       insn->off, BPF_SIZE(insn->code),
17938					       BPF_WRITE, -1, false, false);
17939			if (err)
17940				return err;
17941
17942			err = save_aux_ptr_type(env, dst_reg_type, false);
17943			if (err)
17944				return err;
17945		} else if (class == BPF_JMP || class == BPF_JMP32) {
17946			u8 opcode = BPF_OP(insn->code);
17947
17948			env->jmps_processed++;
17949			if (opcode == BPF_CALL) {
17950				if (BPF_SRC(insn->code) != BPF_K ||
17951				    (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
17952				     && insn->off != 0) ||
17953				    (insn->src_reg != BPF_REG_0 &&
17954				     insn->src_reg != BPF_PSEUDO_CALL &&
17955				     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
17956				    insn->dst_reg != BPF_REG_0 ||
17957				    class == BPF_JMP32) {
17958					verbose(env, "BPF_CALL uses reserved fields\n");
17959					return -EINVAL;
17960				}
17961
17962				if (env->cur_state->active_lock.ptr) {
17963					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
17964					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
17965					     (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
17966						verbose(env, "function calls are not allowed while holding a lock\n");
17967						return -EINVAL;
17968					}
17969				}
17970				if (insn->src_reg == BPF_PSEUDO_CALL) {
17971					err = check_func_call(env, insn, &env->insn_idx);
17972				} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
17973					err = check_kfunc_call(env, insn, &env->insn_idx);
17974					if (!err && is_bpf_throw_kfunc(insn)) {
17975						exception_exit = true;
17976						goto process_bpf_exit_full;
17977					}
17978				} else {
17979					err = check_helper_call(env, insn, &env->insn_idx);
17980				}
17981				if (err)
17982					return err;
17983
17984				mark_reg_scratched(env, BPF_REG_0);
17985			} else if (opcode == BPF_JA) {
17986				if (BPF_SRC(insn->code) != BPF_K ||
17987				    insn->src_reg != BPF_REG_0 ||
17988				    insn->dst_reg != BPF_REG_0 ||
17989				    (class == BPF_JMP && insn->imm != 0) ||
17990				    (class == BPF_JMP32 && insn->off != 0)) {
17991					verbose(env, "BPF_JA uses reserved fields\n");
17992					return -EINVAL;
17993				}
17994
17995				if (class == BPF_JMP)
17996					env->insn_idx += insn->off + 1;
17997				else
17998					env->insn_idx += insn->imm + 1;
17999				continue;
18000
18001			} else if (opcode == BPF_EXIT) {
18002				if (BPF_SRC(insn->code) != BPF_K ||
18003				    insn->imm != 0 ||
18004				    insn->src_reg != BPF_REG_0 ||
18005				    insn->dst_reg != BPF_REG_0 ||
18006				    class == BPF_JMP32) {
18007					verbose(env, "BPF_EXIT uses reserved fields\n");
18008					return -EINVAL;
18009				}
18010process_bpf_exit_full:
18011				if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
18012					verbose(env, "bpf_spin_unlock is missing\n");
18013					return -EINVAL;
18014				}
18015
18016				if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
18017					verbose(env, "bpf_rcu_read_unlock is missing\n");
18018					return -EINVAL;
18019				}
18020
18021				if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
18022					verbose(env, "%d bpf_preempt_enable%s missing\n",
18023						env->cur_state->active_preempt_lock,
18024						env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
18025					return -EINVAL;
18026				}
18027
				/* We must do check_reference_leak here before
				 * prepare_func_exit to handle the case when
				 * state->curframe > 0: it may be a callback
				 * function, whose reference_state must match
				 * the caller's reference state when it exits.
				 */
18034				err = check_reference_leak(env, exception_exit);
18035				if (err)
18036					return err;
18037
				/* The side effect of skipping prepare_func_exit
				 * here is that the bpf_func_state it would have
				 * freed stays allocated. Typically, process_bpf_exit
				 * will only be hit with the outermost exit.
				 * copy_verifier_state in pop_stack will handle
				 * freeing of any extra bpf_func_state left over
				 * from not processing all nested function
				 * exits. We also skip return code checks as
				 * they are not needed for exceptional exits.
				 */
18048				if (exception_exit)
18049					goto process_bpf_exit;
18050
18051				if (state->curframe) {
18052					/* exit from nested function */
18053					err = prepare_func_exit(env, &env->insn_idx);
18054					if (err)
18055						return err;
18056					do_print_state = true;
18057					continue;
18058				}
18059
18060				err = check_return_code(env, BPF_REG_0, "R0");
18061				if (err)
18062					return err;
18063process_bpf_exit:
18064				mark_verifier_state_scratched(env);
18065				update_branch_counts(env, env->cur_state);
18066				err = pop_stack(env, &prev_insn_idx,
18067						&env->insn_idx, pop_log);
18068				if (err < 0) {
18069					if (err != -ENOENT)
18070						return err;
18071					break;
18072				} else {
18073					do_print_state = true;
18074					continue;
18075				}
18076			} else {
18077				err = check_cond_jmp_op(env, insn, &env->insn_idx);
18078				if (err)
18079					return err;
18080			}
18081		} else if (class == BPF_LD) {
18082			u8 mode = BPF_MODE(insn->code);
18083
18084			if (mode == BPF_ABS || mode == BPF_IND) {
18085				err = check_ld_abs(env, insn);
18086				if (err)
18087					return err;
18088
18089			} else if (mode == BPF_IMM) {
18090				err = check_ld_imm(env, insn);
18091				if (err)
18092					return err;
18093
18094				env->insn_idx++;
18095				sanitize_mark_insn_seen(env);
18096			} else {
18097				verbose(env, "invalid BPF_LD mode\n");
18098				return -EINVAL;
18099			}
18100		} else {
18101			verbose(env, "unknown insn class %d\n", class);
18102			return -EINVAL;
18103		}
18104
18105		env->insn_idx++;
18106	}
18107
18108	return 0;
18109}
18110
18111static int find_btf_percpu_datasec(struct btf *btf)
18112{
18113	const struct btf_type *t;
18114	const char *tname;
18115	int i, n;
18116
18117	/*
18118	 * Both vmlinux and module each have their own ".data..percpu"
18119	 * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF
18120	 * types to look at only module's own BTF types.
18121	 */
18122	n = btf_nr_types(btf);
18123	if (btf_is_module(btf))
18124		i = btf_nr_types(btf_vmlinux);
18125	else
18126		i = 1;
18127
	for (; i < n; i++) {
18129		t = btf_type_by_id(btf, i);
18130		if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
18131			continue;
18132
18133		tname = btf_name_by_offset(btf, t->name_off);
18134		if (!strcmp(tname, ".data..percpu"))
18135			return i;
18136	}
18137
18138	return -ENOENT;
18139}
18140
18141/* replace pseudo btf_id with kernel symbol address */
18142static int check_pseudo_btf_id(struct bpf_verifier_env *env,
18143			       struct bpf_insn *insn,
18144			       struct bpf_insn_aux_data *aux)
18145{
18146	const struct btf_var_secinfo *vsi;
18147	const struct btf_type *datasec;
18148	struct btf_mod_pair *btf_mod;
18149	const struct btf_type *t;
18150	const char *sym_name;
18151	bool percpu = false;
18152	u32 type, id = insn->imm;
18153	struct btf *btf;
18154	s32 datasec_id;
18155	u64 addr;
18156	int i, btf_fd, err;
18157
18158	btf_fd = insn[1].imm;
18159	if (btf_fd) {
18160		btf = btf_get_by_fd(btf_fd);
18161		if (IS_ERR(btf)) {
18162			verbose(env, "invalid module BTF object FD specified.\n");
18163			return -EINVAL;
18164		}
18165	} else {
18166		if (!btf_vmlinux) {
18167			verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
18168			return -EINVAL;
18169		}
18170		btf = btf_vmlinux;
18171		btf_get(btf);
18172	}
18173
18174	t = btf_type_by_id(btf, id);
18175	if (!t) {
18176		verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
18177		err = -ENOENT;
18178		goto err_put;
18179	}
18180
18181	if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
18182		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
18183		err = -EINVAL;
18184		goto err_put;
18185	}
18186
18187	sym_name = btf_name_by_offset(btf, t->name_off);
18188	addr = kallsyms_lookup_name(sym_name);
18189	if (!addr) {
18190		verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
18191			sym_name);
18192		err = -ENOENT;
18193		goto err_put;
18194	}
18195	insn[0].imm = (u32)addr;
18196	insn[1].imm = addr >> 32;
18197
18198	if (btf_type_is_func(t)) {
18199		aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
18200		aux->btf_var.mem_size = 0;
18201		goto check_btf;
18202	}
18203
18204	datasec_id = find_btf_percpu_datasec(btf);
18205	if (datasec_id > 0) {
18206		datasec = btf_type_by_id(btf, datasec_id);
18207		for_each_vsi(i, datasec, vsi) {
18208			if (vsi->type == id) {
18209				percpu = true;
18210				break;
18211			}
18212		}
18213	}
18214
18215	type = t->type;
18216	t = btf_type_skip_modifiers(btf, type, NULL);
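	/* A ksym found inside the ".data..percpu" DATASEC is a per-CPU
	 * variable; mark it MEM_PERCPU so the program has to go through
	 * bpf_per_cpu_ptr()/bpf_this_cpu_ptr() to get a usable pointer.
	 */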
18217	if (percpu) {
18218		aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
18219		aux->btf_var.btf = btf;
18220		aux->btf_var.btf_id = type;
18221	} else if (!btf_type_is_struct(t)) {
18222		const struct btf_type *ret;
18223		const char *tname;
18224		u32 tsize;
18225
18226		/* resolve the type size of ksym. */
18227		ret = btf_resolve_size(btf, t, &tsize);
18228		if (IS_ERR(ret)) {
18229			tname = btf_name_by_offset(btf, t->name_off);
18230			verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
18231				tname, PTR_ERR(ret));
18232			err = -EINVAL;
18233			goto err_put;
18234		}
18235		aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
18236		aux->btf_var.mem_size = tsize;
18237	} else {
18238		aux->btf_var.reg_type = PTR_TO_BTF_ID;
18239		aux->btf_var.btf = btf;
18240		aux->btf_var.btf_id = type;
18241	}
18242check_btf:
18243	/* check whether we recorded this BTF (and maybe module) already */
18244	for (i = 0; i < env->used_btf_cnt; i++) {
18245		if (env->used_btfs[i].btf == btf) {
18246			btf_put(btf);
18247			return 0;
18248		}
18249	}
18250
18251	if (env->used_btf_cnt >= MAX_USED_BTFS) {
18252		err = -E2BIG;
18253		goto err_put;
18254	}
18255
18256	btf_mod = &env->used_btfs[env->used_btf_cnt];
18257	btf_mod->btf = btf;
18258	btf_mod->module = NULL;
18259
	/* if we reference variables from a kernel module, bump its refcount */
18261	if (btf_is_module(btf)) {
18262		btf_mod->module = btf_try_get_module(btf);
18263		if (!btf_mod->module) {
18264			err = -ENXIO;
18265			goto err_put;
18266		}
18267	}
18268
18269	env->used_btf_cnt++;
18270
18271	return 0;
18272err_put:
18273	btf_put(btf);
18274	return err;
18275}
18276
18277static bool is_tracing_prog_type(enum bpf_prog_type type)
18278{
18279	switch (type) {
18280	case BPF_PROG_TYPE_KPROBE:
18281	case BPF_PROG_TYPE_TRACEPOINT:
18282	case BPF_PROG_TYPE_PERF_EVENT:
18283	case BPF_PROG_TYPE_RAW_TRACEPOINT:
18284	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
18285		return true;
18286	default:
18287		return false;
18288	}
18289}
18290
18291static int check_map_prog_compatibility(struct bpf_verifier_env *env,
18292					struct bpf_map *map,
18293					struct bpf_prog *prog)
18294
18295{
18296	enum bpf_prog_type prog_type = resolve_prog_type(prog);
18297
18298	if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
18299	    btf_record_has_field(map->record, BPF_RB_ROOT)) {
18300		if (is_tracing_prog_type(prog_type)) {
18301			verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
18302			return -EINVAL;
18303		}
18304	}
18305
18306	if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
18307		if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
18308			verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
18309			return -EINVAL;
18310		}
18311
18312		if (is_tracing_prog_type(prog_type)) {
18313			verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
18314			return -EINVAL;
18315		}
18316	}
18317
18318	if (btf_record_has_field(map->record, BPF_TIMER)) {
18319		if (is_tracing_prog_type(prog_type)) {
18320			verbose(env, "tracing progs cannot use bpf_timer yet\n");
18321			return -EINVAL;
18322		}
18323	}
18324
18325	if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
18326		if (is_tracing_prog_type(prog_type)) {
18327			verbose(env, "tracing progs cannot use bpf_wq yet\n");
18328			return -EINVAL;
18329		}
18330	}
18331
18332	if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
18333	    !bpf_offload_prog_map_match(prog, map)) {
18334		verbose(env, "offload device mismatch between prog and map\n");
18335		return -EINVAL;
18336	}
18337
18338	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
18339		verbose(env, "bpf_struct_ops map cannot be used in prog\n");
18340		return -EINVAL;
18341	}
18342
18343	if (prog->sleepable)
18344		switch (map->map_type) {
18345		case BPF_MAP_TYPE_HASH:
18346		case BPF_MAP_TYPE_LRU_HASH:
18347		case BPF_MAP_TYPE_ARRAY:
18348		case BPF_MAP_TYPE_PERCPU_HASH:
18349		case BPF_MAP_TYPE_PERCPU_ARRAY:
18350		case BPF_MAP_TYPE_LRU_PERCPU_HASH:
18351		case BPF_MAP_TYPE_ARRAY_OF_MAPS:
18352		case BPF_MAP_TYPE_HASH_OF_MAPS:
18353		case BPF_MAP_TYPE_RINGBUF:
18354		case BPF_MAP_TYPE_USER_RINGBUF:
18355		case BPF_MAP_TYPE_INODE_STORAGE:
18356		case BPF_MAP_TYPE_SK_STORAGE:
18357		case BPF_MAP_TYPE_TASK_STORAGE:
18358		case BPF_MAP_TYPE_CGRP_STORAGE:
18359		case BPF_MAP_TYPE_QUEUE:
18360		case BPF_MAP_TYPE_STACK:
18361		case BPF_MAP_TYPE_ARENA:
18362			break;
18363		default:
18364			verbose(env,
18365				"Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
18366			return -EINVAL;
18367		}
18368
18369	return 0;
18370}
18371
18372static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
18373{
18374	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
18375		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
18376}
18377
18378/* find and rewrite pseudo imm in ld_imm64 instructions:
18379 *
18380 * 1. if it accesses map FD, replace it with actual map pointer.
18381 * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
18382 *
18383 * NOTE: btf_vmlinux is required for converting pseudo btf_id.
18384 */
18385static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
18386{
18387	struct bpf_insn *insn = env->prog->insnsi;
18388	int insn_cnt = env->prog->len;
18389	int i, j, err;
18390
18391	err = bpf_prog_calc_tag(env->prog);
18392	if (err)
18393		return err;
18394
18395	for (i = 0; i < insn_cnt; i++, insn++) {
18396		if (BPF_CLASS(insn->code) == BPF_LDX &&
18397		    ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
18398		    insn->imm != 0)) {
18399			verbose(env, "BPF_LDX uses reserved fields\n");
18400			return -EINVAL;
18401		}
18402
18403		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
18404			struct bpf_insn_aux_data *aux;
18405			struct bpf_map *map;
18406			struct fd f;
18407			u64 addr;
18408			u32 fd;
18409
18410			if (i == insn_cnt - 1 || insn[1].code != 0 ||
18411			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
18412			    insn[1].off != 0) {
18413				verbose(env, "invalid bpf_ld_imm64 insn\n");
18414				return -EINVAL;
18415			}
18416
18417			if (insn[0].src_reg == 0)
18418				/* valid generic load 64-bit imm */
18419				goto next_insn;
18420
18421			if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
18422				aux = &env->insn_aux_data[i];
18423				err = check_pseudo_btf_id(env, insn, aux);
18424				if (err)
18425					return err;
18426				goto next_insn;
18427			}
18428
18429			if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
18430				aux = &env->insn_aux_data[i];
18431				aux->ptr_type = PTR_TO_FUNC;
18432				goto next_insn;
18433			}
18434
			/* In the final convert_pseudo_ld_imm64() step, this is
			 * converted into a regular 64-bit imm load insn.
			 */
18438			switch (insn[0].src_reg) {
18439			case BPF_PSEUDO_MAP_VALUE:
18440			case BPF_PSEUDO_MAP_IDX_VALUE:
18441				break;
18442			case BPF_PSEUDO_MAP_FD:
18443			case BPF_PSEUDO_MAP_IDX:
18444				if (insn[1].imm == 0)
18445					break;
18446				fallthrough;
18447			default:
18448				verbose(env, "unrecognized bpf_ld_imm64 insn\n");
18449				return -EINVAL;
18450			}
18451
18452			switch (insn[0].src_reg) {
18453			case BPF_PSEUDO_MAP_IDX_VALUE:
18454			case BPF_PSEUDO_MAP_IDX:
18455				if (bpfptr_is_null(env->fd_array)) {
18456					verbose(env, "fd_idx without fd_array is invalid\n");
18457					return -EPROTO;
18458				}
18459				if (copy_from_bpfptr_offset(&fd, env->fd_array,
18460							    insn[0].imm * sizeof(fd),
18461							    sizeof(fd)))
18462					return -EFAULT;
18463				break;
18464			default:
18465				fd = insn[0].imm;
18466				break;
18467			}
18468
18469			f = fdget(fd);
18470			map = __bpf_map_get(f);
18471			if (IS_ERR(map)) {
18472				verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
18473				return PTR_ERR(map);
18474			}
18475
18476			err = check_map_prog_compatibility(env, map, env->prog);
18477			if (err) {
18478				fdput(f);
18479				return err;
18480			}
18481
18482			aux = &env->insn_aux_data[i];
18483			if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
18484			    insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
18485				addr = (unsigned long)map;
18486			} else {
18487				u32 off = insn[1].imm;
18488
18489				if (off >= BPF_MAX_VAR_OFF) {
18490					verbose(env, "direct value offset of %u is not allowed\n", off);
18491					fdput(f);
18492					return -EINVAL;
18493				}
18494
18495				if (!map->ops->map_direct_value_addr) {
18496					verbose(env, "no direct value access support for this map type\n");
18497					fdput(f);
18498					return -EINVAL;
18499				}
18500
18501				err = map->ops->map_direct_value_addr(map, &addr, off);
18502				if (err) {
18503					verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
18504						map->value_size, off);
18505					fdput(f);
18506					return err;
18507				}
18508
18509				aux->map_off = off;
18510				addr += off;
18511			}
18512
18513			insn[0].imm = (u32)addr;
18514			insn[1].imm = addr >> 32;
18515
18516			/* check whether we recorded this map already */
18517			for (j = 0; j < env->used_map_cnt; j++) {
18518				if (env->used_maps[j] == map) {
18519					aux->map_index = j;
18520					fdput(f);
18521					goto next_insn;
18522				}
18523			}
18524
18525			if (env->used_map_cnt >= MAX_USED_MAPS) {
18526				verbose(env, "The total number of maps per program has reached the limit of %u\n",
18527					MAX_USED_MAPS);
18528				fdput(f);
18529				return -E2BIG;
18530			}
18531
18532			if (env->prog->sleepable)
18533				atomic64_inc(&map->sleepable_refcnt);
			/* hold the map. If the program is rejected by the
			 * verifier, the map will be released by release_maps();
			 * otherwise it will be used by the valid program until
			 * it's unloaded and all maps are released in
			 * bpf_free_used_maps().
			 */
18539			bpf_map_inc(map);
18540
18541			aux->map_index = env->used_map_cnt;
18542			env->used_maps[env->used_map_cnt++] = map;
18543
18544			if (bpf_map_is_cgroup_storage(map) &&
18545			    bpf_cgroup_storage_assign(env->prog->aux, map)) {
18546				verbose(env, "only one cgroup storage of each type is allowed\n");
18547				fdput(f);
18548				return -EBUSY;
18549			}
18550			if (map->map_type == BPF_MAP_TYPE_ARENA) {
18551				if (env->prog->aux->arena) {
18552					verbose(env, "Only one arena per program\n");
18553					fdput(f);
18554					return -EBUSY;
18555				}
18556				if (!env->allow_ptr_leaks || !env->bpf_capable) {
18557					verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
18558					fdput(f);
18559					return -EPERM;
18560				}
18561				if (!env->prog->jit_requested) {
18562					verbose(env, "JIT is required to use arena\n");
18563					fdput(f);
18564					return -EOPNOTSUPP;
18565				}
18566				if (!bpf_jit_supports_arena()) {
18567					verbose(env, "JIT doesn't support arena\n");
18568					fdput(f);
18569					return -EOPNOTSUPP;
18570				}
18571				env->prog->aux->arena = (void *)map;
18572				if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
18573					verbose(env, "arena's user address must be set via map_extra or mmap()\n");
18574					fdput(f);
18575					return -EINVAL;
18576				}
18577			}
18578
18579			fdput(f);
18580next_insn:
18581			insn++;
18582			i++;
18583			continue;
18584		}
18585
18586		/* Basic sanity check before we invest more work here. */
18587		if (!bpf_opcode_in_insntable(insn->code)) {
18588			verbose(env, "unknown opcode %02x\n", insn->code);
18589			return -EINVAL;
18590		}
18591	}
18592
	/* Now all pseudo BPF_LD_IMM64 instructions load a valid
	 * 'struct bpf_map *' into a register instead of a user map_fd.
	 * These pointers will be used later by the verifier to validate map
	 * accesses.
	 */
18597	return 0;
18598}
18599
18600/* drop refcnt of maps used by the rejected program */
18601static void release_maps(struct bpf_verifier_env *env)
18602{
18603	__bpf_free_used_maps(env->prog->aux, env->used_maps,
18604			     env->used_map_cnt);
18605}
18606
18607/* drop refcnt of maps used by the rejected program */
18608static void release_btfs(struct bpf_verifier_env *env)
18609{
18610	__bpf_free_used_btfs(env->prog->aux, env->used_btfs,
18611			     env->used_btf_cnt);
18612}
18613
18614/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
18615static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
18616{
18617	struct bpf_insn *insn = env->prog->insnsi;
18618	int insn_cnt = env->prog->len;
18619	int i;
18620
18621	for (i = 0; i < insn_cnt; i++, insn++) {
18622		if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
18623			continue;
18624		if (insn->src_reg == BPF_PSEUDO_FUNC)
18625			continue;
18626		insn->src_reg = 0;
18627	}
18628}
18629
/* A single env->prog->insnsi[off] instruction was replaced with the range
 * insnsi[off, off + cnt).  Adjust the corresponding insn_aux_data by copying
 * [0, off) and [off, end) to their new locations, so the newly patched range
 * stays zeroed.
 */
18634static void adjust_insn_aux_data(struct bpf_verifier_env *env,
18635				 struct bpf_insn_aux_data *new_data,
18636				 struct bpf_prog *new_prog, u32 off, u32 cnt)
18637{
18638	struct bpf_insn_aux_data *old_data = env->insn_aux_data;
18639	struct bpf_insn *insn = new_prog->insnsi;
18640	u32 old_seen = old_data[off].seen;
18641	u32 prog_len;
18642	int i;
18643
	/* aux info at OFF always needs adjustment, no matter whether the fast
	 * path (cnt == 1) is taken or not. There is no guarantee that the insn
	 * at OFF is still the original insn of the old prog.
	 */
18648	old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
18649
18650	if (cnt == 1)
18651		return;
18652	prog_len = new_prog->len;
18653
18654	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
18655	memcpy(new_data + off + cnt - 1, old_data + off,
18656	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
18657	for (i = off; i < off + cnt - 1; i++) {
		/* Expand insnsi[off]'s seen count to the patched range. */
18659		new_data[i].seen = old_seen;
18660		new_data[i].zext_dst = insn_has_def32(env, insn + i);
18661	}
18662	env->insn_aux_data = new_data;
18663	vfree(old_data);
18664}
18665
18666static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
18667{
18668	int i;
18669
18670	if (len == 1)
18671		return;
18672	/* NOTE: fake 'exit' subprog should be updated as well. */
18673	for (i = 0; i <= env->subprog_cnt; i++) {
18674		if (env->subprog_info[i].start <= off)
18675			continue;
18676		env->subprog_info[i].start += len - 1;
18677	}
18678}
18679
18680static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
18681{
18682	struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
18683	int i, sz = prog->aux->size_poke_tab;
18684	struct bpf_jit_poke_descriptor *desc;
18685
18686	for (i = 0; i < sz; i++) {
18687		desc = &tab[i];
18688		if (desc->insn_idx <= off)
18689			continue;
18690		desc->insn_idx += len - 1;
18691	}
18692}
18693
18694static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
18695					    const struct bpf_insn *patch, u32 len)
18696{
18697	struct bpf_prog *new_prog;
18698	struct bpf_insn_aux_data *new_data = NULL;
18699
18700	if (len > 1) {
18701		new_data = vzalloc(array_size(env->prog->len + len - 1,
18702					      sizeof(struct bpf_insn_aux_data)));
18703		if (!new_data)
18704			return NULL;
18705	}
18706
18707	new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
18708	if (IS_ERR(new_prog)) {
18709		if (PTR_ERR(new_prog) == -ERANGE)
18710			verbose(env,
18711				"insn %d cannot be patched due to 16-bit range\n",
18712				env->insn_aux_data[off].orig_idx);
18713		vfree(new_data);
18714		return NULL;
18715	}
18716	adjust_insn_aux_data(env, new_data, new_prog, off, len);
18717	adjust_subprog_starts(env, off, len);
18718	adjust_poke_descs(new_prog, off, len);
18719	return new_prog;
18720}
18721
18722static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
18723					      u32 off, u32 cnt)
18724{
18725	int i, j;
18726
18727	/* find first prog starting at or after off (first to remove) */
18728	for (i = 0; i < env->subprog_cnt; i++)
18729		if (env->subprog_info[i].start >= off)
18730			break;
18731	/* find first prog starting at or after off + cnt (first to stay) */
18732	for (j = i; j < env->subprog_cnt; j++)
18733		if (env->subprog_info[j].start >= off + cnt)
18734			break;
	/* if subprog j doesn't start exactly at off + cnt, we are only removing
	 * the front of the previous subprog
	 */
18738	if (env->subprog_info[j].start != off + cnt)
18739		j--;
18740
18741	if (j > i) {
18742		struct bpf_prog_aux *aux = env->prog->aux;
18743		int move;
18744
18745		/* move fake 'exit' subprog as well */
18746		move = env->subprog_cnt + 1 - j;
18747
18748		memmove(env->subprog_info + i,
18749			env->subprog_info + j,
18750			sizeof(*env->subprog_info) * move);
18751		env->subprog_cnt -= j - i;
18752
18753		/* remove func_info */
18754		if (aux->func_info) {
18755			move = aux->func_info_cnt - j;
18756
18757			memmove(aux->func_info + i,
18758				aux->func_info + j,
18759				sizeof(*aux->func_info) * move);
18760			aux->func_info_cnt -= j - i;
18761			/* func_info->insn_off is set after all code rewrites,
18762			 * in adjust_btf_func() - no need to adjust
18763			 */
18764		}
18765	} else {
18766		/* convert i from "first prog to remove" to "first to adjust" */
18767		if (env->subprog_info[i].start == off)
18768			i++;
18769	}
18770
18771	/* update fake 'exit' subprog as well */
18772	for (; i <= env->subprog_cnt; i++)
18773		env->subprog_info[i].start -= cnt;
18774
18775	return 0;
18776}
18777
18778static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
18779				      u32 cnt)
18780{
18781	struct bpf_prog *prog = env->prog;
18782	u32 i, l_off, l_cnt, nr_linfo;
18783	struct bpf_line_info *linfo;
18784
18785	nr_linfo = prog->aux->nr_linfo;
18786	if (!nr_linfo)
18787		return 0;
18788
18789	linfo = prog->aux->linfo;
18790
18791	/* find first line info to remove, count lines to be removed */
18792	for (i = 0; i < nr_linfo; i++)
18793		if (linfo[i].insn_off >= off)
18794			break;
18795
18796	l_off = i;
18797	l_cnt = 0;
18798	for (; i < nr_linfo; i++)
18799		if (linfo[i].insn_off < off + cnt)
18800			l_cnt++;
18801		else
18802			break;
18803
	/* If the first live insn doesn't match the first live linfo, it needs
	 * to "inherit" the last removed linfo.  prog is already modified, so
	 * prog->len == off means there are no live instructions after it (the
	 * tail of the program was removed).
	 */
18808	if (prog->len != off && l_cnt &&
18809	    (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
18810		l_cnt--;
18811		linfo[--i].insn_off = off + cnt;
18812	}
18813
	/* remove the line infos which refer to the removed instructions */
18815	if (l_cnt) {
18816		memmove(linfo + l_off, linfo + i,
18817			sizeof(*linfo) * (nr_linfo - i));
18818
18819		prog->aux->nr_linfo -= l_cnt;
18820		nr_linfo = prog->aux->nr_linfo;
18821	}
18822
18823	/* pull all linfo[i].insn_off >= off + cnt in by cnt */
18824	for (i = l_off; i < nr_linfo; i++)
18825		linfo[i].insn_off -= cnt;
18826
18827	/* fix up all subprogs (incl. 'exit') which start >= off */
18828	for (i = 0; i <= env->subprog_cnt; i++)
18829		if (env->subprog_info[i].linfo_idx > l_off) {
18830			/* program may have started in the removed region but
18831			 * may not be fully removed
18832			 */
18833			if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
18834				env->subprog_info[i].linfo_idx -= l_cnt;
18835			else
18836				env->subprog_info[i].linfo_idx = l_off;
18837		}
18838
18839	return 0;
18840}
18841
18842static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
18843{
18844	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
18845	unsigned int orig_prog_len = env->prog->len;
18846	int err;
18847
18848	if (bpf_prog_is_offloaded(env->prog->aux))
18849		bpf_prog_offload_remove_insns(env, off, cnt);
18850
18851	err = bpf_remove_insns(env->prog, off, cnt);
18852	if (err)
18853		return err;
18854
18855	err = adjust_subprog_starts_after_remove(env, off, cnt);
18856	if (err)
18857		return err;
18858
18859	err = bpf_adj_linfo_after_remove(env, off, cnt);
18860	if (err)
18861		return err;
18862
18863	memmove(aux_data + off,	aux_data + off + cnt,
18864		sizeof(*aux_data) * (orig_prog_len - off - cnt));
18865
18866	return 0;
18867}
18868
/* The verifier does more data flow analysis than llvm and will not
 * explore branches that are dead at run time. Malicious programs can
 * have dead code too. Therefore replace all dead at-run-time code
 * with 'ja -1'.
 *
 * Plain nops would not be optimal: if, e.g., they sat at the end of the
 * program and through another bug we managed to jump there, we would
 * execute beyond program memory. Returning an exception code also
 * wouldn't work, since we can have subprogs where the dead code could
 * be located.
 */
18880static void sanitize_dead_code(struct bpf_verifier_env *env)
18881{
18882	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
18883	struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
18884	struct bpf_insn *insn = env->prog->insnsi;
18885	const int insn_cnt = env->prog->len;
18886	int i;
18887
18888	for (i = 0; i < insn_cnt; i++) {
18889		if (aux_data[i].seen)
18890			continue;
18891		memcpy(insn + i, &trap, sizeof(trap));
18892		aux_data[i].zext_dst = false;
18893	}
18894}
18895
18896static bool insn_is_cond_jump(u8 code)
18897{
18898	u8 op;
18899
18900	op = BPF_OP(code);
18901	if (BPF_CLASS(code) == BPF_JMP32)
18902		return op != BPF_JA;
18903
18904	if (BPF_CLASS(code) != BPF_JMP)
18905		return false;
18906
18907	return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
18908}
18909
18910static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
18911{
18912	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
18913	struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
18914	struct bpf_insn *insn = env->prog->insnsi;
18915	const int insn_cnt = env->prog->len;
18916	int i;
18917
18918	for (i = 0; i < insn_cnt; i++, insn++) {
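		/* If the fall-through insn is dead, the branch is always
		 * taken: hard-wire it into an unconditional 'ja insn->off'.
		 * If the branch target is dead, the branch is never taken:
		 * hard-wire it into a fall-through ('ja 0').
		 */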
18919		if (!insn_is_cond_jump(insn->code))
18920			continue;
18921
18922		if (!aux_data[i + 1].seen)
18923			ja.off = insn->off;
18924		else if (!aux_data[i + 1 + insn->off].seen)
18925			ja.off = 0;
18926		else
18927			continue;
18928
18929		if (bpf_prog_is_offloaded(env->prog->aux))
18930			bpf_prog_offload_replace_insn(env, i, &ja);
18931
18932		memcpy(insn, &ja, sizeof(ja));
18933	}
18934}
18935
18936static int opt_remove_dead_code(struct bpf_verifier_env *env)
18937{
18938	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
18939	int insn_cnt = env->prog->len;
18940	int i, err;
18941
18942	for (i = 0; i < insn_cnt; i++) {
18943		int j;
18944
18945		j = 0;
18946		while (i + j < insn_cnt && !aux_data[i + j].seen)
18947			j++;
18948		if (!j)
18949			continue;
18950
18951		err = verifier_remove_insns(env, i, j);
18952		if (err)
18953			return err;
18954		insn_cnt = env->prog->len;
18955	}
18956
18957	return 0;
18958}
18959
18960static int opt_remove_nops(struct bpf_verifier_env *env)
18961{
18962	const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
18963	struct bpf_insn *insn = env->prog->insnsi;
18964	int insn_cnt = env->prog->len;
18965	int i, err;
18966
18967	for (i = 0; i < insn_cnt; i++) {
18968		if (memcmp(&insn[i], &ja, sizeof(ja)))
18969			continue;
18970
18971		err = verifier_remove_insns(env, i, 1);
18972		if (err)
18973			return err;
18974		insn_cnt--;
18975		i--;
18976	}
18977
18978	return 0;
18979}
18980
18981static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
18982					 const union bpf_attr *attr)
18983{
18984	struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
18985	struct bpf_insn_aux_data *aux = env->insn_aux_data;
18986	int i, patch_len, delta = 0, len = env->prog->len;
18987	struct bpf_insn *insns = env->prog->insnsi;
18988	struct bpf_prog *new_prog;
18989	bool rnd_hi32;
18990
18991	rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
18992	zext_patch[1] = BPF_ZEXT_REG(0);
18993	rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
18994	rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
18995	rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
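	/* Two patch templates, each prefixed with the original insn at [0]:
	 *   zext_patch:     mov32 dst, dst                 (explicit zero-ext)
	 *   rnd_hi32_patch: mov64 AX, imm_rnd; lsh64 AX, 32; or64 dst, AX
	 * The latter poisons the upper 32 bits of a sub-register definition
	 * when BPF_F_TEST_RND_HI32 is set, to catch missed zero extensions.
	 */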
18996	for (i = 0; i < len; i++) {
18997		int adj_idx = i + delta;
18998		struct bpf_insn insn;
18999		int load_reg;
19000
19001		insn = insns[adj_idx];
19002		load_reg = insn_def_regno(&insn);
19003		if (!aux[adj_idx].zext_dst) {
19004			u8 code, class;
19005			u32 imm_rnd;
19006
19007			if (!rnd_hi32)
19008				continue;
19009
19010			code = insn.code;
19011			class = BPF_CLASS(code);
19012			if (load_reg == -1)
19013				continue;
19014
19015			/* NOTE: arg "reg" (the fourth one) is only used for
19016			 *       BPF_STX + SRC_OP, so it is safe to pass NULL
19017			 *       here.
19018			 */
19019			if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
19020				if (class == BPF_LD &&
19021				    BPF_MODE(code) == BPF_IMM)
19022					i++;
19023				continue;
19024			}
19025
19026			/* ctx load could be transformed into wider load. */
19027			if (class == BPF_LDX &&
19028			    aux[adj_idx].ptr_type == PTR_TO_CTX)
19029				continue;
19030
19031			imm_rnd = get_random_u32();
19032			rnd_hi32_patch[0] = insn;
19033			rnd_hi32_patch[1].imm = imm_rnd;
19034			rnd_hi32_patch[3].dst_reg = load_reg;
19035			patch = rnd_hi32_patch;
19036			patch_len = 4;
19037			goto apply_patch_buffer;
19038		}
19039
		/* Add in a zero-extend instruction if a) the JIT has requested
19041		 * it or b) it's a CMPXCHG.
19042		 *
19043		 * The latter is because: BPF_CMPXCHG always loads a value into
19044		 * R0, therefore always zero-extends. However some archs'
19045		 * equivalent instruction only does this load when the
19046		 * comparison is successful. This detail of CMPXCHG is
19047		 * orthogonal to the general zero-extension behaviour of the
19048		 * CPU, so it's treated independently of bpf_jit_needs_zext.
19049		 */
19050		if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
19051			continue;
19052
19053		/* Zero-extension is done by the caller. */
19054		if (bpf_pseudo_kfunc_call(&insn))
19055			continue;
19056
19057		if (WARN_ON(load_reg == -1)) {
19058			verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
19059			return -EFAULT;
19060		}
19061
19062		zext_patch[0] = insn;
19063		zext_patch[1].dst_reg = load_reg;
19064		zext_patch[1].src_reg = load_reg;
19065		patch = zext_patch;
19066		patch_len = 2;
19067apply_patch_buffer:
19068		new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
19069		if (!new_prog)
19070			return -ENOMEM;
19071		env->prog = new_prog;
19072		insns = new_prog->insnsi;
19073		aux = env->insn_aux_data;
19074		delta += patch_len - 1;
19075	}
19076
19077	return 0;
19078}
19079
19080/* convert load instructions that access fields of a context type into a
19081 * sequence of instructions that access fields of the underlying structure:
19082 *     struct __sk_buff    -> struct sk_buff
19083 *     struct bpf_sock_ops -> struct sock
19084 */
19085static int convert_ctx_accesses(struct bpf_verifier_env *env)
19086{
19087	const struct bpf_verifier_ops *ops = env->ops;
19088	int i, cnt, size, ctx_field_size, delta = 0;
19089	const int insn_cnt = env->prog->len;
19090	struct bpf_insn insn_buf[16], *insn;
19091	u32 target_size, size_default, off;
19092	struct bpf_prog *new_prog;
19093	enum bpf_access_type type;
19094	bool is_narrower_load;
19095
19096	if (ops->gen_prologue || env->seen_direct_write) {
19097		if (!ops->gen_prologue) {
19098			verbose(env, "bpf verifier is misconfigured\n");
19099			return -EINVAL;
19100		}
19101		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
19102					env->prog);
19103		if (cnt >= ARRAY_SIZE(insn_buf)) {
19104			verbose(env, "bpf verifier is misconfigured\n");
19105			return -EINVAL;
19106		} else if (cnt) {
19107			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
19108			if (!new_prog)
19109				return -ENOMEM;
19110
19111			env->prog = new_prog;
19112			delta += cnt - 1;
19113		}
19114	}
19115
19116	if (bpf_prog_is_offloaded(env->prog->aux))
19117		return 0;
19118
19119	insn = env->prog->insnsi + delta;
19120
19121	for (i = 0; i < insn_cnt; i++, insn++) {
19122		bpf_convert_ctx_access_t convert_ctx_access;
19123		u8 mode;
19124
19125		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
19126		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
19127		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
19128		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
19129		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
19130		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
19131		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
19132			type = BPF_READ;
19133		} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
19134			   insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
19135			   insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
19136			   insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
19137			   insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
19138			   insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
19139			   insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
19140			   insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
19141			type = BPF_WRITE;
19142		} else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
19143			    insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
19144			   env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
19145			insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
19146			env->prog->aux->num_exentries++;
19147			continue;
19148		} else {
19149			continue;
19150		}
19151
19152		if (type == BPF_WRITE &&
19153		    env->insn_aux_data[i + delta].sanitize_stack_spill) {
19154			struct bpf_insn patch[] = {
19155				*insn,
19156				BPF_ST_NOSPEC(),
19157			};
19158
19159			cnt = ARRAY_SIZE(patch);
19160			new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
19161			if (!new_prog)
19162				return -ENOMEM;
19163
19164			delta    += cnt - 1;
19165			env->prog = new_prog;
19166			insn      = new_prog->insnsi + i + delta;
19167			continue;
19168		}
19169
19170		switch ((int)env->insn_aux_data[i + delta].ptr_type) {
19171		case PTR_TO_CTX:
19172			if (!ops->convert_ctx_access)
19173				continue;
19174			convert_ctx_access = ops->convert_ctx_access;
19175			break;
19176		case PTR_TO_SOCKET:
19177		case PTR_TO_SOCK_COMMON:
19178			convert_ctx_access = bpf_sock_convert_ctx_access;
19179			break;
19180		case PTR_TO_TCP_SOCK:
19181			convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
19182			break;
19183		case PTR_TO_XDP_SOCK:
19184			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
19185			break;
19186		case PTR_TO_BTF_ID:
19187		case PTR_TO_BTF_ID | PTR_UNTRUSTED:
		/* PTR_TO_BTF_ID | MEM_ALLOC, unlike PTR_TO_BTF_ID, always has a
		 * valid lifetime and an active ref_obj_id, but the same cannot
		 * be said once it is marked PTR_UNTRUSTED, hence we must handle
		 * any faults for loads of such types. BPF_WRITE is disallowed
		 * for this case.
		 */
19194		case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
19195			if (type == BPF_READ) {
19196				if (BPF_MODE(insn->code) == BPF_MEM)
19197					insn->code = BPF_LDX | BPF_PROBE_MEM |
19198						     BPF_SIZE((insn)->code);
19199				else
19200					insn->code = BPF_LDX | BPF_PROBE_MEMSX |
19201						     BPF_SIZE((insn)->code);
19202				env->prog->aux->num_exentries++;
19203			}
19204			continue;
19205		case PTR_TO_ARENA:
19206			if (BPF_MODE(insn->code) == BPF_MEMSX) {
19207				verbose(env, "sign extending loads from arena are not supported yet\n");
19208				return -EOPNOTSUPP;
19209			}
19210			insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
19211			env->prog->aux->num_exentries++;
19212			continue;
19213		default:
19214			continue;
19215		}
19216
19217		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
19218		size = BPF_LDST_BYTES(insn);
19219		mode = BPF_MODE(insn->code);
19220
		/* If the read access is a narrower load of the field,
		 * convert it to a 4/8-byte load to minimize program-type
		 * specific convert_ctx_access changes. If the conversion is
		 * successful, we will apply a proper mask to the result.
		 */
19226		is_narrower_load = size < ctx_field_size;
19227		size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
19228		off = insn->off;
19229		if (is_narrower_load) {
19230			u8 size_code;
19231
19232			if (type == BPF_WRITE) {
19233				verbose(env, "bpf verifier narrow ctx access misconfigured\n");
19234				return -EINVAL;
19235			}
19236
19237			size_code = BPF_H;
19238			if (ctx_field_size == 4)
19239				size_code = BPF_W;
19240			else if (ctx_field_size == 8)
19241				size_code = BPF_DW;
19242
19243			insn->off = off & ~(size_default - 1);
19244			insn->code = BPF_LDX | BPF_MEM | size_code;
19245		}
19246
19247		target_size = 0;
19248		cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
19249					 &target_size);
19250		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
19251		    (ctx_field_size && !target_size)) {
19252			verbose(env, "bpf verifier is misconfigured\n");
19253			return -EINVAL;
19254		}
19255
19256		if (is_narrower_load && size < target_size) {
19257			u8 shift = bpf_ctx_narrow_access_offset(
19258				off, size, size_default) * 8;
19259			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
19260				verbose(env, "bpf verifier narrow ctx load misconfigured\n");
19261				return -EINVAL;
19262			}
19263			if (ctx_field_size <= 4) {
19264				if (shift)
19265					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
19266									insn->dst_reg,
19267									shift);
19268				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
19269								(1 << size * 8) - 1);
19270			} else {
19271				if (shift)
19272					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
19273									insn->dst_reg,
19274									shift);
19275				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
19276								(1ULL << size * 8) - 1);
19277			}
19278		}
19279		if (mode == BPF_MEMSX)
19280			insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
19281						       insn->dst_reg, insn->dst_reg,
19282						       size * 8, 0);
19283
19284		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
19285		if (!new_prog)
19286			return -ENOMEM;
19287
19288		delta += cnt - 1;
19289
19290		/* keep walking new program and skip insns we just inserted */
19291		env->prog = new_prog;
19292		insn      = new_prog->insnsi + i + delta;
19293	}
19294
19295	return 0;
19296}
19297
19298static int jit_subprogs(struct bpf_verifier_env *env)
19299{
19300	struct bpf_prog *prog = env->prog, **func, *tmp;
19301	int i, j, subprog_start, subprog_end = 0, len, subprog;
19302	struct bpf_map *map_ptr;
19303	struct bpf_insn *insn;
19304	void *old_bpf_func;
19305	int err, num_exentries;
19306
19307	if (env->subprog_cnt <= 1)
19308		return 0;
19309
19310	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
19311		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
19312			continue;
19313
19314		/* Upon error here we cannot fall back to interpreter but
19315		 * need a hard reject of the program. Thus -EFAULT is
19316		 * propagated in any case.
19317		 */
19318		subprog = find_subprog(env, i + insn->imm + 1);
19319		if (subprog < 0) {
19320			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
19321				  i + insn->imm + 1);
19322			return -EFAULT;
19323		}
19324		/* temporarily remember subprog id inside insn instead of
19325		 * aux_data, since next loop will split up all insns into funcs
19326		 */
19327		insn->off = subprog;
19328		/* remember original imm in case JIT fails and fallback
19329		 * to interpreter will be needed
19330		 */
19331		env->insn_aux_data[i].call_imm = insn->imm;
19332		/* point imm to __bpf_call_base+1 from JITs point of view */
19333		insn->imm = 1;
19334		if (bpf_pseudo_func(insn)) {
19335#if defined(MODULES_VADDR)
19336			u64 addr = MODULES_VADDR;
19337#else
19338			u64 addr = VMALLOC_START;
19339#endif
			/* The JIT (e.g. x86_64) may emit fewer instructions
			 * if it learns a u32 imm is the same as a u64 imm.
			 * Set it close enough to a possible prog address.
			 */
19344			insn[0].imm = (u32)addr;
19345			insn[1].imm = addr >> 32;
19346		}
19347	}
19348
19349	err = bpf_prog_alloc_jited_linfo(prog);
19350	if (err)
19351		goto out_undo_insn;
19352
19353	err = -ENOMEM;
19354	func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
19355	if (!func)
19356		goto out_undo_insn;
19357
19358	for (i = 0; i < env->subprog_cnt; i++) {
19359		subprog_start = subprog_end;
19360		subprog_end = env->subprog_info[i + 1].start;
19361
19362		len = subprog_end - subprog_start;
		/* bpf_prog_run() doesn't call subprogs directly,
		 * hence main prog stats include the runtime of subprogs.
		 * subprogs don't have IDs and are not reachable via
		 * prog_get_next_id(), so func[i]->stats will never be
		 * accessed and stays NULL.
		 */
19368		func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
19369		if (!func[i])
19370			goto out_free;
19371		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
19372		       len * sizeof(struct bpf_insn));
19373		func[i]->type = prog->type;
19374		func[i]->len = len;
19375		if (bpf_prog_calc_tag(func[i]))
19376			goto out_free;
19377		func[i]->is_func = 1;
19378		func[i]->sleepable = prog->sleepable;
19379		func[i]->aux->func_idx = i;
19380		/* Below members will be freed only at prog->aux */
19381		func[i]->aux->btf = prog->aux->btf;
19382		func[i]->aux->func_info = prog->aux->func_info;
19383		func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
19384		func[i]->aux->poke_tab = prog->aux->poke_tab;
19385		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
19386
19387		for (j = 0; j < prog->aux->size_poke_tab; j++) {
19388			struct bpf_jit_poke_descriptor *poke;
19389
19390			poke = &prog->aux->poke_tab[j];
19391			if (poke->insn_idx < subprog_end &&
19392			    poke->insn_idx >= subprog_start)
19393				poke->aux = func[i]->aux;
19394		}
19395
19396		func[i]->aux->name[0] = 'F';
19397		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
19398		func[i]->jit_requested = 1;
19399		func[i]->blinding_requested = prog->blinding_requested;
19400		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
19401		func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
19402		func[i]->aux->linfo = prog->aux->linfo;
19403		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
19404		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
19405		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
19406		func[i]->aux->arena = prog->aux->arena;
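		/* Count insns that may fault at run time (BPF_PROBE_* loads,
		 * stores and atomics); each needs an exception table entry.
		 */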
19407		num_exentries = 0;
19408		insn = func[i]->insnsi;
19409		for (j = 0; j < func[i]->len; j++, insn++) {
19410			if (BPF_CLASS(insn->code) == BPF_LDX &&
19411			    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
19412			     BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
19413			     BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
19414				num_exentries++;
19415			if ((BPF_CLASS(insn->code) == BPF_STX ||
19416			     BPF_CLASS(insn->code) == BPF_ST) &&
19417			     BPF_MODE(insn->code) == BPF_PROBE_MEM32)
19418				num_exentries++;
19419			if (BPF_CLASS(insn->code) == BPF_STX &&
19420			     BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
19421				num_exentries++;
19422		}
19423		func[i]->aux->num_exentries = num_exentries;
19424		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
19425		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
19426		if (!i)
19427			func[i]->aux->exception_boundary = env->seen_exception;
19428		func[i] = bpf_int_jit_compile(func[i]);
19429		if (!func[i]->jited) {
19430			err = -ENOTSUPP;
19431			goto out_free;
19432		}
19433		cond_resched();
19434	}
19435
	/* At this point all bpf functions were successfully JITed;
	 * now populate all bpf_calls with correct addresses and
	 * run the last pass of the JIT.
	 */
19440	for (i = 0; i < env->subprog_cnt; i++) {
19441		insn = func[i]->insnsi;
19442		for (j = 0; j < func[i]->len; j++, insn++) {
19443			if (bpf_pseudo_func(insn)) {
19444				subprog = insn->off;
19445				insn[0].imm = (u32)(long)func[subprog]->bpf_func;
19446				insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
19447				continue;
19448			}
19449			if (!bpf_pseudo_call(insn))
19450				continue;
19451			subprog = insn->off;
19452			insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
19453		}
19454
19455		/* we use the aux data to keep a list of the start addresses
19456		 * of the JITed images for each function in the program
19457		 *
19458		 * for some architectures, such as powerpc64, the imm field
19459		 * might not be large enough to hold the offset of the start
19460		 * address of the callee's JITed image from __bpf_call_base
19461		 *
19462		 * in such cases, we can lookup the start address of a callee
19463		 * by using its subprog id, available from the off field of
19464		 * the call instruction, as an index for this list
19465		 */
19466		func[i]->aux->func = func;
19467		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
19468		func[i]->aux->real_func_cnt = env->subprog_cnt;
19469	}
19470	for (i = 0; i < env->subprog_cnt; i++) {
19471		old_bpf_func = func[i]->bpf_func;
19472		tmp = bpf_int_jit_compile(func[i]);
19473		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
19474			verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
19475			err = -ENOTSUPP;
19476			goto out_free;
19477		}
19478		cond_resched();
19479	}
19480
19481	/* finally lock prog and jit images for all functions and
	 * populate kallsyms. Begin at the first subprogram, since
19483	 * bpf_prog_load will add the kallsyms for the main program.
19484	 */
19485	for (i = 1; i < env->subprog_cnt; i++) {
19486		err = bpf_prog_lock_ro(func[i]);
19487		if (err)
19488			goto out_free;
19489	}
19490
19491	for (i = 1; i < env->subprog_cnt; i++)
19492		bpf_prog_kallsyms_add(func[i]);
19493
	/* Last step: make the now unused interpreter insns of the main
	 * prog consistent for later dump requests, so they look the
	 * same as if they had only ever been interpreted.
	 */
19498	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
19499		if (bpf_pseudo_func(insn)) {
19500			insn[0].imm = env->insn_aux_data[i].call_imm;
19501			insn[1].imm = insn->off;
19502			insn->off = 0;
19503			continue;
19504		}
19505		if (!bpf_pseudo_call(insn))
19506			continue;
19507		insn->off = env->insn_aux_data[i].call_imm;
19508		subprog = find_subprog(env, i + insn->off + 1);
19509		insn->imm = subprog;
19510	}
19511
19512	prog->jited = 1;
19513	prog->bpf_func = func[0]->bpf_func;
19514	prog->jited_len = func[0]->jited_len;
19515	prog->aux->extable = func[0]->aux->extable;
19516	prog->aux->num_exentries = func[0]->aux->num_exentries;
19517	prog->aux->func = func;
19518	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
19519	prog->aux->real_func_cnt = env->subprog_cnt;
19520	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
19521	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
19522	bpf_prog_jit_attempt_done(prog);
19523	return 0;
19524out_free:
	/* JIT'ing failed, so at this point we need to unregister poke
	 * descriptors from the subprogs, so that the kernel does not attempt
	 * to patch them anymore while we are freeing the subprog JIT memory.
	 */
19529	for (i = 0; i < prog->aux->size_poke_tab; i++) {
19530		map_ptr = prog->aux->poke_tab[i].tail_call.map;
19531		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
19532	}
	/* At this point we're guaranteed that poke descriptors are not
	 * live anymore. We can just unlink each subprog's descriptor table,
	 * as it is released together with the main prog.
	 */
19537	for (i = 0; i < env->subprog_cnt; i++) {
19538		if (!func[i])
19539			continue;
19540		func[i]->aux->poke_tab = NULL;
19541		bpf_jit_free(func[i]);
19542	}
19543	kfree(func);
19544out_undo_insn:
19545	/* cleanup main prog to be interpreted */
19546	prog->jit_requested = 0;
19547	prog->blinding_requested = 0;
19548	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
19549		if (!bpf_pseudo_call(insn))
19550			continue;
19551		insn->off = 0;
19552		insn->imm = env->insn_aux_data[i].call_imm;
19553	}
19554	bpf_prog_jit_attempt_done(prog);
19555	return err;
19556}
19557
19558static int fixup_call_args(struct bpf_verifier_env *env)
19559{
19560#ifndef CONFIG_BPF_JIT_ALWAYS_ON
19561	struct bpf_prog *prog = env->prog;
19562	struct bpf_insn *insn = prog->insnsi;
19563	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
19564	int i, depth;
19565#endif
19566	int err = 0;
19567
19568	if (env->prog->jit_requested &&
19569	    !bpf_prog_is_offloaded(env->prog->aux)) {
19570		err = jit_subprogs(env);
19571		if (err == 0)
19572			return 0;
19573		if (err == -EFAULT)
19574			return err;
19575	}
19576#ifndef CONFIG_BPF_JIT_ALWAYS_ON
19577	if (has_kfunc_call) {
		verbose(env, "calling kernel functions is not allowed in non-JITed programs\n");
19579		return -EINVAL;
19580	}
19581	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
		/* When the JIT fails, progs with bpf2bpf calls and tail_calls
		 * have to be rejected, since the interpreter doesn't support them yet.
		 */
19585		verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
19586		return -EINVAL;
19587	}
19588	for (i = 0; i < prog->len; i++, insn++) {
19589		if (bpf_pseudo_func(insn)) {
			/* When the JIT fails, progs with callback calls
			 * have to be rejected, since the interpreter doesn't support them yet.
			 */
19593			verbose(env, "callbacks are not allowed in non-JITed programs\n");
19594			return -EINVAL;
19595		}
19596
19597		if (!bpf_pseudo_call(insn))
19598			continue;
19599		depth = get_callee_stack_depth(env, insn, i);
19600		if (depth < 0)
19601			return depth;
19602		bpf_patch_call_args(insn, depth);
19603	}
19604	err = 0;
19605#endif
19606	return err;
19607}
19608
19609/* replace a generic kfunc with a specialized version if necessary */
19610static void specialize_kfunc(struct bpf_verifier_env *env,
19611			     u32 func_id, u16 offset, unsigned long *addr)
19612{
19613	struct bpf_prog *prog = env->prog;
19614	bool seen_direct_write;
19615	void *xdp_kfunc;
19616	bool is_rdonly;
19617
19618	if (bpf_dev_bound_kfunc_id(func_id)) {
19619		xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
19620		if (xdp_kfunc) {
19621			*addr = (unsigned long)xdp_kfunc;
19622			return;
19623		}
19624		/* fallback to default kfunc when not supported by netdev */
19625	}
19626
19627	if (offset)
19628		return;
19629
19630	if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
19631		seen_direct_write = env->seen_direct_write;
19632		is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
19633
19634		if (is_rdonly)
19635			*addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
19636
19637		/* restore env->seen_direct_write to its original value, since
19638		 * may_access_direct_pkt_data mutates it
19639		 */
19640		env->seen_direct_write = seen_direct_write;
19641	}
19642}
19643
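/* Emit the hidden trailing arguments for the collection insert kfuncs
 * (bpf_list_push_{front,back}_impl, bpf_rbtree_add_impl): load the
 * btf_struct_meta pointer into struct_meta_reg and the node field offset
 * (insn_aux->insert_off) into node_offset_reg, then re-emit the original call.
 */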
19644static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
19645					    u16 struct_meta_reg,
19646					    u16 node_offset_reg,
19647					    struct bpf_insn *insn,
19648					    struct bpf_insn *insn_buf,
19649					    int *cnt)
19650{
19651	struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
19652	struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
19653
19654	insn_buf[0] = addr[0];
19655	insn_buf[1] = addr[1];
19656	insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
19657	insn_buf[3] = *insn;
19658	*cnt = 4;
19659}
19660
19661static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
19662			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
19663{
19664	const struct bpf_kfunc_desc *desc;
19665
19666	if (!insn->imm) {
19667		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
19668		return -EINVAL;
19669	}
19670
19671	*cnt = 0;
19672
19673	/* insn->imm has the btf func_id. Replace it with an offset relative to
19674	 * __bpf_call_base, unless the JIT needs to call functions that are
19675	 * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
19676	 */
19677	desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
19678	if (!desc) {
19679		verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
19680			insn->imm);
19681		return -EFAULT;
19682	}
19683
19684	if (!bpf_jit_supports_far_kfunc_call())
19685		insn->imm = BPF_CALL_IMM(desc->addr);
19686	if (insn->off)
19687		return 0;
19688	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
19689	    desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
19690		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
19691		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
19692		u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
19693
19694		if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
19695			verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
19696				insn_idx);
19697			return -EFAULT;
19698		}
19699
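		/* bpf_obj_new_impl()/bpf_percpu_obj_new_impl() have hidden
		 * arguments: R1, which the program loaded with the local type
		 * id, is overwritten with the object size, and R2 is loaded
		 * with the btf_struct_meta pointer (NULL for the percpu case).
		 */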
19700		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
19701		insn_buf[1] = addr[0];
19702		insn_buf[2] = addr[1];
19703		insn_buf[3] = *insn;
19704		*cnt = 4;
19705	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
19706		   desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
19707		   desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
19708		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
19709		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
19710
19711		if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
19712			verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
19713				insn_idx);
19714			return -EFAULT;
19715		}
19716
19717		if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
19718		    !kptr_struct_meta) {
19719			verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
19720				insn_idx);
19721			return -EFAULT;
19722		}
19723
19724		insn_buf[0] = addr[0];
19725		insn_buf[1] = addr[1];
19726		insn_buf[2] = *insn;
19727		*cnt = 3;
19728	} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
19729		   desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
19730		   desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
19731		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
19732		int struct_meta_reg = BPF_REG_3;
19733		int node_offset_reg = BPF_REG_4;
19734
19735		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
19736		if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
19737			struct_meta_reg = BPF_REG_4;
19738			node_offset_reg = BPF_REG_5;
19739		}
19740
19741		if (!kptr_struct_meta) {
19742			verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
19743				insn_idx);
19744			return -EFAULT;
19745		}
19746
19747		__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
19748						node_offset_reg, insn, insn_buf, cnt);
19749	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
19750		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
19751		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
19752		*cnt = 1;
19753	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
19754		struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
19755
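		/* bpf_wq_set_callback_impl() has a hidden aux__ign argument in
		 * R4; pass the calling prog's aux so the kernel can associate
		 * the callback with this program.
		 */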
19756		insn_buf[0] = ld_addrs[0];
19757		insn_buf[1] = ld_addrs[1];
19758		insn_buf[2] = *insn;
19759		*cnt = 3;
19760	}
19761	return 0;
19762}
19763
/* The function requires that the first instruction in 'patch' is insnsi[prog->len - 1] */
19765static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
19766{
19767	struct bpf_subprog_info *info = env->subprog_info;
19768	int cnt = env->subprog_cnt;
19769	struct bpf_prog *prog;
19770
19771	/* We only reserve one slot for hidden subprogs in subprog_info. */
19772	if (env->hidden_subprog_cnt) {
19773		verbose(env, "verifier internal error: only one hidden subprog supported\n");
19774		return -EFAULT;
19775	}
19776	/* We're not patching any existing instruction, just appending the new
19777	 * ones for the hidden subprog. Hence all of the adjustment operations
19778	 * in bpf_patch_insn_data are no-ops.
19779	 */
19780	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
19781	if (!prog)
19782		return -ENOMEM;
19783	env->prog = prog;
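	/* The hidden subprog starts right after the original program: the
	 * first patch insn re-emits the old last insn in place, the rest
	 * becomes the body of the new subprog.
	 */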
19784	info[cnt + 1].start = info[cnt].start;
19785	info[cnt].start = prog->len - len + 1;
19786	env->subprog_cnt++;
19787	env->hidden_subprog_cnt++;
19788	return 0;
19789}
19790
19791/* Do various post-verification rewrites in a single program pass.
19792 * These rewrites simplify JIT and interpreter implementations.
19793 */
19794static int do_misc_fixups(struct bpf_verifier_env *env)
19795{
19796	struct bpf_prog *prog = env->prog;
19797	enum bpf_attach_type eatype = prog->expected_attach_type;
19798	enum bpf_prog_type prog_type = resolve_prog_type(prog);
19799	struct bpf_insn *insn = prog->insnsi;
19800	const struct bpf_func_proto *fn;
19801	const int insn_cnt = prog->len;
19802	const struct bpf_map_ops *ops;
19803	struct bpf_insn_aux_data *aux;
19804	struct bpf_insn insn_buf[16];
19805	struct bpf_prog *new_prog;
19806	struct bpf_map *map_ptr;
19807	int i, ret, cnt, delta = 0, cur_subprog = 0;
19808	struct bpf_subprog_info *subprogs = env->subprog_info;
19809	u16 stack_depth = subprogs[cur_subprog].stack_depth;
19810	u16 stack_depth_extra = 0;
19811
19812	if (env->seen_exception && !env->exception_callback_subprog) {
19813		struct bpf_insn patch[] = {
19814			env->prog->insnsi[insn_cnt - 1],
19815			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
19816			BPF_EXIT_INSN(),
19817		};
19818
19819		ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
19820		if (ret < 0)
19821			return ret;
19822		prog = env->prog;
19823		insn = prog->insnsi;
19824
19825		env->exception_callback_subprog = env->subprog_cnt - 1;
19826		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
19827		mark_subprog_exc_cb(env, env->exception_callback_subprog);
19828	}
19829
19830	for (i = 0; i < insn_cnt;) {
19831		if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
19832			if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
19833			    (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
19834				/* convert to 32-bit mov that clears upper 32-bit */
19835				insn->code = BPF_ALU | BPF_MOV | BPF_X;
19836				/* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
19837				insn->off = 0;
19838				insn->imm = 0;
19839			} /* cast from as(0) to as(1) should be handled by JIT */
19840			goto next_insn;
19841		}
19842
19843		if (env->insn_aux_data[i + delta].needs_zext)
19844			/* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
19845			insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
19846
19847		/* Make divide-by-zero exceptions impossible. */
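		/* The generated patchlets are roughly equivalent to:
		 *   div: dst = src ? dst / src : 0
		 *   mod: dst = src ? dst % src : dst
		 * with an extra 32-bit mov in the 32-bit mod case to
		 * re-zero-extend dst when the operation is skipped.
		 */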
19848		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
19849		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
19850		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
19851		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
19852			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
19853			bool isdiv = BPF_OP(insn->code) == BPF_DIV;
19854			struct bpf_insn *patchlet;
19855			struct bpf_insn chk_and_div[] = {
19856				/* [R,W]x div 0 -> 0 */
19857				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
19858					     BPF_JNE | BPF_K, insn->src_reg,
19859					     0, 2, 0),
19860				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
19861				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
19862				*insn,
19863			};
19864			struct bpf_insn chk_and_mod[] = {
19865				/* [R,W]x mod 0 -> [R,W]x */
19866				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
19867					     BPF_JEQ | BPF_K, insn->src_reg,
19868					     0, 1 + (is64 ? 0 : 1), 0),
19869				*insn,
19870				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
19871				BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
19872			};
19873
19874			patchlet = isdiv ? chk_and_div : chk_and_mod;
19875			cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
19876				      ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
19877
19878			new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
19879			if (!new_prog)
19880				return -ENOMEM;
19881
19882			delta    += cnt - 1;
19883			env->prog = prog = new_prog;
19884			insn      = new_prog->insnsi + i + delta;
19885			goto next_insn;
19886		}
19887
19888		/* Make it impossible to de-reference a userspace address */
19889		if (BPF_CLASS(insn->code) == BPF_LDX &&
19890		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
19891		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
19892			struct bpf_insn *patch = &insn_buf[0];
19893			u64 uaddress_limit = bpf_arch_uaddress_limit();
19894
19895			if (!uaddress_limit)
19896				goto next_insn;
19897
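			/* The emitted guard is roughly:
			 *   if (((u64)(src + off) >> 32) > (uaddress_limit >> 32))
			 *           dst = *(src + off);   // original PROBE_MEM[SX] load
			 *   else
			 *           dst = 0;              // looks like a user address, skip the load
			 */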
19898			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
19899			if (insn->off)
19900				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
19901			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
19902			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
19903			*patch++ = *insn;
19904			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
19905			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
19906
19907			cnt = patch - insn_buf;
19908			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
19909			if (!new_prog)
19910				return -ENOMEM;
19911
19912			delta    += cnt - 1;
19913			env->prog = prog = new_prog;
19914			insn      = new_prog->insnsi + i + delta;
19915			goto next_insn;
19916		}
19917
19918		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
19919		if (BPF_CLASS(insn->code) == BPF_LD &&
19920		    (BPF_MODE(insn->code) == BPF_ABS ||
19921		     BPF_MODE(insn->code) == BPF_IND)) {
19922			cnt = env->ops->gen_ld_abs(insn, insn_buf);
19923			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
19924				verbose(env, "bpf verifier is misconfigured\n");
19925				return -EINVAL;
19926			}
19927
19928			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
19929			if (!new_prog)
19930				return -ENOMEM;
19931
19932			delta    += cnt - 1;
19933			env->prog = prog = new_prog;
19934			insn      = new_prog->insnsi + i + delta;
19935			goto next_insn;
19936		}
19937
19938		/* Rewrite pointer arithmetic to mitigate speculation attacks. */
19939		if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
19940		    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
19941			const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
19942			const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
19943			struct bpf_insn *patch = &insn_buf[0];
19944			bool issrc, isneg, isimm;
19945			u32 off_reg;
19946
19947			aux = &env->insn_aux_data[i + delta];
19948			if (!aux->alu_state ||
19949			    aux->alu_state == BPF_ALU_NON_POINTER)
19950				goto next_insn;
19951
19952			isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
19953			issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
19954				BPF_ALU_SANITIZE_SRC;
19955			isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
19956
19957			off_reg = issrc ? insn->src_reg : insn->dst_reg;
19958			if (isimm) {
19959				*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
19960			} else {
19961				if (isneg)
19962					*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
19963				*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
19964				*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
19965				*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
19966				*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
19967				*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
19968				*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
19969			}
19970			if (!issrc)
19971				*patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
19972			insn->src_reg = BPF_REG_AX;
19973			if (isneg)
19974				insn->code = insn->code == code_add ?
19975					     code_sub : code_add;
19976			*patch++ = *insn;
19977			if (issrc && isneg && !isimm)
19978				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
19979			cnt = patch - insn_buf;
19980
19981			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
19982			if (!new_prog)
19983				return -ENOMEM;
19984
19985			delta    += cnt - 1;
19986			env->prog = prog = new_prog;
19987			insn      = new_prog->insnsi + i + delta;
19988			goto next_insn;
19989		}
19990
19991		if (is_may_goto_insn(insn)) {
19992			int stack_off = -stack_depth - 8;
19993
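			/* Lower may_goto using an 8-byte iteration counter kept in
			 * an extra stack slot just below the subprog's stack (it is
			 * initialized to BPF_MAX_LOOPS in the prologue, see the
			 * stack_extra handling below): load the counter, take the
			 * goto once it reaches zero, otherwise decrement it and
			 * store it back.
			 */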
19994			stack_depth_extra = 8;
19995			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
19996			insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
19997			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
19998			insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
19999			cnt = 4;
20000
20001			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20002			if (!new_prog)
20003				return -ENOMEM;
20004
20005			delta += cnt - 1;
20006			env->prog = prog = new_prog;
20007			insn = new_prog->insnsi + i + delta;
20008			goto next_insn;
20009		}
20010
20011		if (insn->code != (BPF_JMP | BPF_CALL))
20012			goto next_insn;
20013		if (insn->src_reg == BPF_PSEUDO_CALL)
20014			goto next_insn;
20015		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
20016			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
20017			if (ret)
20018				return ret;
20019			if (cnt == 0)
20020				goto next_insn;
20021
20022			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20023			if (!new_prog)
20024				return -ENOMEM;
20025
20026			delta	 += cnt - 1;
20027			env->prog = prog = new_prog;
20028			insn	  = new_prog->insnsi + i + delta;
20029			goto next_insn;
20030		}
20031
20032		/* Skip inlining the helper call if the JIT does it. */
20033		if (bpf_jit_inlines_helper_call(insn->imm))
20034			goto next_insn;
20035
20036		if (insn->imm == BPF_FUNC_get_route_realm)
20037			prog->dst_needed = 1;
20038		if (insn->imm == BPF_FUNC_get_prandom_u32)
20039			bpf_user_rnd_init_once();
20040		if (insn->imm == BPF_FUNC_override_return)
20041			prog->kprobe_override = 1;
20042		if (insn->imm == BPF_FUNC_tail_call) {
20043			/* If we tail call into other programs, we
20044			 * cannot make any assumptions since they can
20045			 * be replaced dynamically during runtime in
20046			 * the program array.
20047			 */
20048			prog->cb_access = 1;
20049			if (!allow_tail_call_in_subprogs(env))
20050				prog->aux->stack_depth = MAX_BPF_STACK;
20051			prog->aux->max_pkt_offset = MAX_PACKET_OFF;
20052
			/* mark bpf_tail_call with a different opcode to avoid a
			 * conditional branch in the interpreter for every normal
			 * call, and to prevent accidental JITing by a JIT compiler
			 * that doesn't support bpf_tail_call yet
			 */
20058			insn->imm = 0;
20059			insn->code = BPF_JMP | BPF_TAIL_CALL;
20060
20061			aux = &env->insn_aux_data[i + delta];
20062			if (env->bpf_capable && !prog->blinding_requested &&
20063			    prog->jit_requested &&
20064			    !bpf_map_key_poisoned(aux) &&
20065			    !bpf_map_ptr_poisoned(aux) &&
20066			    !bpf_map_ptr_unpriv(aux)) {
20067				struct bpf_jit_poke_descriptor desc = {
20068					.reason = BPF_POKE_REASON_TAIL_CALL,
20069					.tail_call.map = aux->map_ptr_state.map_ptr,
20070					.tail_call.key = bpf_map_key_immediate(aux),
20071					.insn_idx = i + delta,
20072				};
20073
20074				ret = bpf_jit_add_poke_descriptor(prog, &desc);
20075				if (ret < 0) {
20076					verbose(env, "adding tail call poke descriptor failed\n");
20077					return ret;
20078				}
20079
20080				insn->imm = ret + 1;
20081				goto next_insn;
20082			}
20083
20084			if (!bpf_map_ptr_unpriv(aux))
20085				goto next_insn;
20086
20087			/* instead of changing every JIT dealing with tail_call
20088			 * emit two extra insns:
20089			 * if (index >= max_entries) goto out;
20090			 * index &= array->index_mask;
20091			 * to avoid out-of-bounds cpu speculation
20092			 */
20093			if (bpf_map_ptr_poisoned(aux)) {
20094				verbose(env, "tail_call abusing map_ptr\n");
20095				return -EINVAL;
20096			}
20097
20098			map_ptr = aux->map_ptr_state.map_ptr;
20099			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
20100						  map_ptr->max_entries, 2);
20101			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
20102						    container_of(map_ptr,
20103								 struct bpf_array,
20104								 map)->index_mask);
20105			insn_buf[2] = *insn;
20106			cnt = 3;
20107			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20108			if (!new_prog)
20109				return -ENOMEM;
20110
20111			delta    += cnt - 1;
20112			env->prog = prog = new_prog;
20113			insn      = new_prog->insnsi + i + delta;
20114			goto next_insn;
20115		}
20116
20117		if (insn->imm == BPF_FUNC_timer_set_callback) {
			/* The verifier will process callback_fn as many times as necessary
			 * with different maps, and the register states prepared by
			 * set_timer_callback_state will be accurate.
20121			 *
20122			 * The following use case is valid:
20123			 *   map1 is shared by prog1, prog2, prog3.
20124			 *   prog1 calls bpf_timer_init for some map1 elements
20125			 *   prog2 calls bpf_timer_set_callback for some map1 elements.
20126			 *     Those that were not bpf_timer_init-ed will return -EINVAL.
20127			 *   prog3 calls bpf_timer_start for some map1 elements.
20128			 *     Those that were not both bpf_timer_init-ed and
20129			 *     bpf_timer_set_callback-ed will return -EINVAL.
20130			 */
20131			struct bpf_insn ld_addrs[2] = {
20132				BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
20133			};
20134
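			/* bpf_timer_set_callback() takes the calling prog's aux as
			 * a hidden third argument (R3), which lets the kernel find
			 * and take a reference on the program owning the callback.
			 */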
20135			insn_buf[0] = ld_addrs[0];
20136			insn_buf[1] = ld_addrs[1];
20137			insn_buf[2] = *insn;
20138			cnt = 3;
20139
20140			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20141			if (!new_prog)
20142				return -ENOMEM;
20143
20144			delta    += cnt - 1;
20145			env->prog = prog = new_prog;
20146			insn      = new_prog->insnsi + i + delta;
20147			goto patch_call_imm;
20148		}
20149
20150		if (is_storage_get_function(insn->imm)) {
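			/* The *_storage_get() helpers take a hidden gfp_t in R5:
			 * sleepable programs may allocate with GFP_KERNEL unless
			 * the call site was marked atomic, everything else gets
			 * GFP_ATOMIC.
			 */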
20151			if (!in_sleepable(env) ||
20152			    env->insn_aux_data[i + delta].storage_get_func_atomic)
20153				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
20154			else
20155				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
20156			insn_buf[1] = *insn;
20157			cnt = 2;
20158
20159			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20160			if (!new_prog)
20161				return -ENOMEM;
20162
20163			delta += cnt - 1;
20164			env->prog = prog = new_prog;
20165			insn = new_prog->insnsi + i + delta;
20166			goto patch_call_imm;
20167		}
20168
20169		/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
20170		if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
20171			/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
20172			 * bpf_mem_alloc() returns a ptr to the percpu data ptr.
20173			 */
20174			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
20175			insn_buf[1] = *insn;
20176			cnt = 2;
20177
20178			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20179			if (!new_prog)
20180				return -ENOMEM;
20181
20182			delta += cnt - 1;
20183			env->prog = prog = new_prog;
20184			insn = new_prog->insnsi + i + delta;
20185			goto patch_call_imm;
20186		}
20187
		/* The BPF_EMIT_CALL() assumptions made by some of the
		 * map_gen_lookup and other inlining handlers currently only
		 * hold on 64-bit hosts.
		 */
20192		if (prog->jit_requested && BITS_PER_LONG == 64 &&
20193		    (insn->imm == BPF_FUNC_map_lookup_elem ||
20194		     insn->imm == BPF_FUNC_map_update_elem ||
20195		     insn->imm == BPF_FUNC_map_delete_elem ||
20196		     insn->imm == BPF_FUNC_map_push_elem   ||
20197		     insn->imm == BPF_FUNC_map_pop_elem    ||
20198		     insn->imm == BPF_FUNC_map_peek_elem   ||
20199		     insn->imm == BPF_FUNC_redirect_map    ||
20200		     insn->imm == BPF_FUNC_for_each_map_elem ||
20201		     insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
20202			aux = &env->insn_aux_data[i + delta];
20203			if (bpf_map_ptr_poisoned(aux))
20204				goto patch_call_imm;
20205
20206			map_ptr = aux->map_ptr_state.map_ptr;
20207			ops = map_ptr->ops;
20208			if (insn->imm == BPF_FUNC_map_lookup_elem &&
20209			    ops->map_gen_lookup) {
20210				cnt = ops->map_gen_lookup(map_ptr, insn_buf);
20211				if (cnt == -EOPNOTSUPP)
20212					goto patch_map_ops_generic;
20213				if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
20214					verbose(env, "bpf verifier is misconfigured\n");
20215					return -EINVAL;
20216				}
20217
20218				new_prog = bpf_patch_insn_data(env, i + delta,
20219							       insn_buf, cnt);
20220				if (!new_prog)
20221					return -ENOMEM;
20222
20223				delta    += cnt - 1;
20224				env->prog = prog = new_prog;
20225				insn      = new_prog->insnsi + i + delta;
20226				goto next_insn;
20227			}
20228
20229			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
20230				     (void *(*)(struct bpf_map *map, void *key))NULL));
20231			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
20232				     (long (*)(struct bpf_map *map, void *key))NULL));
20233			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
20234				     (long (*)(struct bpf_map *map, void *key, void *value,
20235					      u64 flags))NULL));
20236			BUILD_BUG_ON(!__same_type(ops->map_push_elem,
20237				     (long (*)(struct bpf_map *map, void *value,
20238					      u64 flags))NULL));
20239			BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
20240				     (long (*)(struct bpf_map *map, void *value))NULL));
20241			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
20242				     (long (*)(struct bpf_map *map, void *value))NULL));
20243			BUILD_BUG_ON(!__same_type(ops->map_redirect,
20244				     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
20245			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
20246				     (long (*)(struct bpf_map *map,
20247					      bpf_callback_t callback_fn,
20248					      void *callback_ctx,
20249					      u64 flags))NULL));
20250			BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
20251				     (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
20252
20253patch_map_ops_generic:
20254			switch (insn->imm) {
20255			case BPF_FUNC_map_lookup_elem:
20256				insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
20257				goto next_insn;
20258			case BPF_FUNC_map_update_elem:
20259				insn->imm = BPF_CALL_IMM(ops->map_update_elem);
20260				goto next_insn;
20261			case BPF_FUNC_map_delete_elem:
20262				insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
20263				goto next_insn;
20264			case BPF_FUNC_map_push_elem:
20265				insn->imm = BPF_CALL_IMM(ops->map_push_elem);
20266				goto next_insn;
20267			case BPF_FUNC_map_pop_elem:
20268				insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
20269				goto next_insn;
20270			case BPF_FUNC_map_peek_elem:
20271				insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
20272				goto next_insn;
20273			case BPF_FUNC_redirect_map:
20274				insn->imm = BPF_CALL_IMM(ops->map_redirect);
20275				goto next_insn;
20276			case BPF_FUNC_for_each_map_elem:
20277				insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
20278				goto next_insn;
20279			case BPF_FUNC_map_lookup_percpu_elem:
20280				insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
20281				goto next_insn;
20282			}
20283
20284			goto patch_call_imm;
20285		}
20286
20287		/* Implement bpf_jiffies64 inline. */
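		/* The inlined sequence is simply r0 = *(u64 *)&jiffies; it is
		 * only done when BITS_PER_LONG == 64, where reading the 64-bit
		 * jiffies is a single load.
		 */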
20288		if (prog->jit_requested && BITS_PER_LONG == 64 &&
20289		    insn->imm == BPF_FUNC_jiffies64) {
20290			struct bpf_insn ld_jiffies_addr[2] = {
20291				BPF_LD_IMM64(BPF_REG_0,
20292					     (unsigned long)&jiffies),
20293			};
20294
20295			insn_buf[0] = ld_jiffies_addr[0];
20296			insn_buf[1] = ld_jiffies_addr[1];
20297			insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
20298						  BPF_REG_0, 0);
20299			cnt = 3;
20300
20301			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
20302						       cnt);
20303			if (!new_prog)
20304				return -ENOMEM;
20305
20306			delta    += cnt - 1;
20307			env->prog = prog = new_prog;
20308			insn      = new_prog->insnsi + i + delta;
20309			goto next_insn;
20310		}
20311
20312#ifdef CONFIG_X86_64
20313		/* Implement bpf_get_smp_processor_id() inline. */
20314		if (insn->imm == BPF_FUNC_get_smp_processor_id &&
20315		    prog->jit_requested && bpf_jit_supports_percpu_insn()) {
			/* BPF_FUNC_get_smp_processor_id inlining is an
			 * optimization, so if pcpu_hot.cpu_number is ever
			 * changed in some incompatible and hard-to-support
			 * way, it's fine to back out this inlining logic.
			 */
20321			insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
20322			insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
20323			insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
20324			cnt = 3;
20325
20326			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20327			if (!new_prog)
20328				return -ENOMEM;
20329
20330			delta    += cnt - 1;
20331			env->prog = prog = new_prog;
20332			insn      = new_prog->insnsi + i + delta;
20333			goto next_insn;
20334		}
20335#endif
20336		/* Implement bpf_get_func_arg inline. */
20337		if (prog_type == BPF_PROG_TYPE_TRACING &&
20338		    insn->imm == BPF_FUNC_get_func_arg) {
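			/* Roughly equivalent to:
			 *   if (n >= nr_args) return -EINVAL;
			 *   *value = ((u64 *)ctx)[n];
			 *   return 0;
			 * with nr_args stored by the trampoline at ctx - 8.
			 */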
20339			/* Load nr_args from ctx - 8 */
20340			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
20341			insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
20342			insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
20343			insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
20344			insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
20345			insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
20346			insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
20347			insn_buf[7] = BPF_JMP_A(1);
20348			insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
20349			cnt = 9;
20350
20351			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20352			if (!new_prog)
20353				return -ENOMEM;
20354
20355			delta    += cnt - 1;
20356			env->prog = prog = new_prog;
20357			insn      = new_prog->insnsi + i + delta;
20358			goto next_insn;
20359		}
20360
20361		/* Implement bpf_get_func_ret inline. */
20362		if (prog_type == BPF_PROG_TYPE_TRACING &&
20363		    insn->imm == BPF_FUNC_get_func_ret) {
20364			if (eatype == BPF_TRACE_FEXIT ||
20365			    eatype == BPF_MODIFY_RETURN) {
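				/* The return value sits right after the args in the
				 * trampoline-provided ctx, i.e.:
				 *   *value = ((u64 *)ctx)[nr_args]; return 0;
				 */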
20366				/* Load nr_args from ctx - 8 */
20367				insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
20368				insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
20369				insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
20370				insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
20371				insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
20372				insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
20373				cnt = 6;
20374			} else {
20375				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
20376				cnt = 1;
20377			}
20378
20379			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20380			if (!new_prog)
20381				return -ENOMEM;
20382
20383			delta    += cnt - 1;
20384			env->prog = prog = new_prog;
20385			insn      = new_prog->insnsi + i + delta;
20386			goto next_insn;
20387		}
20388
20389		/* Implement get_func_arg_cnt inline. */
20390		if (prog_type == BPF_PROG_TYPE_TRACING &&
20391		    insn->imm == BPF_FUNC_get_func_arg_cnt) {
20392			/* Load nr_args from ctx - 8 */
20393			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
20394
20395			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
20396			if (!new_prog)
20397				return -ENOMEM;
20398
20399			env->prog = prog = new_prog;
20400			insn      = new_prog->insnsi + i + delta;
20401			goto next_insn;
20402		}
20403
20404		/* Implement bpf_get_func_ip inline. */
20405		if (prog_type == BPF_PROG_TYPE_TRACING &&
20406		    insn->imm == BPF_FUNC_get_func_ip) {
20407			/* Load IP address from ctx - 16 */
20408			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
20409
20410			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
20411			if (!new_prog)
20412				return -ENOMEM;
20413
20414			env->prog = prog = new_prog;
20415			insn      = new_prog->insnsi + i + delta;
20416			goto next_insn;
20417		}
20418
20419		/* Implement bpf_get_branch_snapshot inline. */
20420		if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
20421		    prog->jit_requested && BITS_PER_LONG == 64 &&
20422		    insn->imm == BPF_FUNC_get_branch_snapshot) {
20423			/* We are dealing with the following func protos:
20424			 * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
20425			 * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
20426			 */
20427			const u32 br_entry_size = sizeof(struct perf_branch_entry);
20428
20429			/* struct perf_branch_entry is part of UAPI and is
20430			 * used as an array element, so extremely unlikely to
20431			 * ever grow or shrink
20432			 */
20433			BUILD_BUG_ON(br_entry_size != 24);
20434
20435			/* if (unlikely(flags)) return -EINVAL */
20436			insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
20437
20438			/* Transform size (bytes) into number of entries (cnt = size / 24).
			 * But to avoid an expensive division instruction, we implement
20440			 * divide-by-3 through multiplication, followed by further
20441			 * division by 8 through 3-bit right shift.
20442			 * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
20443			 * p. 227, chapter "Unsigned Division by 3" for details and proofs.
20444			 *
20445			 * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
20446			 */
20447			insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
20448			insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
20449			insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
20450
20451			/* call perf_snapshot_branch_stack implementation */
20452			insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
20453			/* if (entry_cnt == 0) return -ENOENT */
20454			insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
20455			/* return entry_cnt * sizeof(struct perf_branch_entry) */
20456			insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
20457			insn_buf[7] = BPF_JMP_A(3);
20458			/* return -EINVAL; */
20459			insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
20460			insn_buf[9] = BPF_JMP_A(1);
20461			/* return -ENOENT; */
20462			insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
20463			cnt = 11;
20464
20465			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20466			if (!new_prog)
20467				return -ENOMEM;
20468
20469			delta    += cnt - 1;
20470			env->prog = prog = new_prog;
20471			insn      = new_prog->insnsi + i + delta;
			goto next_insn;
20473		}
20474
20475		/* Implement bpf_kptr_xchg inline */
20476		if (prog->jit_requested && BITS_PER_LONG == 64 &&
20477		    insn->imm == BPF_FUNC_kptr_xchg &&
20478		    bpf_jit_supports_ptr_xchg()) {
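			/* bpf_kptr_xchg(dst, ptr) is just an atomic exchange:
			 *   r0 = atomic64_xchg((u64 *)r1, r2)
			 * i.e. the old kptr is returned in r0.
			 */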
20479			insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
20480			insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
20481			cnt = 2;
20482
20483			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
20484			if (!new_prog)
20485				return -ENOMEM;
20486
20487			delta    += cnt - 1;
20488			env->prog = prog = new_prog;
20489			insn      = new_prog->insnsi + i + delta;
20490			goto next_insn;
20491		}
20492patch_call_imm:
20493		fn = env->ops->get_func_proto(insn->imm, env->prog);
		/* all functions that have a prototype and that the verifier
		 * allowed programs to call must be real in-kernel functions
		 */
20497		if (!fn->func) {
20498			verbose(env,
20499				"kernel subsystem misconfigured func %s#%d\n",
20500				func_id_name(insn->imm), insn->imm);
20501			return -EFAULT;
20502		}
20503		insn->imm = fn->func - __bpf_call_base;
20504next_insn:
20505		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
20506			subprogs[cur_subprog].stack_depth += stack_depth_extra;
20507			subprogs[cur_subprog].stack_extra = stack_depth_extra;
20508			cur_subprog++;
20509			stack_depth = subprogs[cur_subprog].stack_depth;
20510			stack_depth_extra = 0;
20511		}
20512		i++;
20513		insn++;
20514	}
20515
20516	env->prog->aux->stack_depth = subprogs[0].stack_depth;
20517	for (i = 0; i < env->subprog_cnt; i++) {
20518		int subprog_start = subprogs[i].start;
20519		int stack_slots = subprogs[i].stack_extra / 8;
20520
20521		if (!stack_slots)
20522			continue;
20523		if (stack_slots > 1) {
20524			verbose(env, "verifier bug: stack_slots supports may_goto only\n");
20525			return -EFAULT;
20526		}
20527
20528		/* Add ST insn to subprog prologue to init extra stack */
20529		insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
20530					 -subprogs[i].stack_depth, BPF_MAX_LOOPS);
20531		/* Copy first actual insn to preserve it */
20532		insn_buf[1] = env->prog->insnsi[subprog_start];
20533
20534		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
20535		if (!new_prog)
20536			return -ENOMEM;
20537		env->prog = prog = new_prog;
20538	}
20539
20540	/* Since poke tab is now finalized, publish aux to tracker. */
20541	for (i = 0; i < prog->aux->size_poke_tab; i++) {
20542		map_ptr = prog->aux->poke_tab[i].tail_call.map;
20543		if (!map_ptr->ops->map_poke_track ||
20544		    !map_ptr->ops->map_poke_untrack ||
20545		    !map_ptr->ops->map_poke_run) {
20546			verbose(env, "bpf verifier is misconfigured\n");
20547			return -EINVAL;
20548		}
20549
20550		ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
20551		if (ret < 0) {
20552			verbose(env, "tracking tail call prog failed\n");
20553			return ret;
20554		}
20555	}
20556
20557	sort_kfunc_descs_by_imm_off(env->prog);
20558
20559	return 0;
20560}
20561
20562static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
20563					int position,
20564					s32 stack_base,
20565					u32 callback_subprogno,
20566					u32 *cnt)
20567{
20568	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
20569	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
20570	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
20571	int reg_loop_max = BPF_REG_6;
20572	int reg_loop_cnt = BPF_REG_7;
20573	int reg_loop_ctx = BPF_REG_8;
20574
20575	struct bpf_prog *new_prog;
20576	u32 callback_start;
20577	u32 call_insn_offset;
20578	s32 callback_offset;
20579
	/* This represents an inlined version of bpf_iter.c:bpf_loop;
	 * take care to keep the two in sync when modifying either.
	 */
20583	struct bpf_insn insn_buf[] = {
20584		/* Return error and jump to the end of the patch if
20585		 * expected number of iterations is too big.
20586		 */
20587		BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
20588		BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
20589		BPF_JMP_IMM(BPF_JA, 0, 0, 16),
20590		/* spill R6, R7, R8 to use these as loop vars */
20591		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
20592		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
20593		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
20594		/* initialize loop vars */
20595		BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
20596		BPF_MOV32_IMM(reg_loop_cnt, 0),
20597		BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
20598		/* loop header,
20599		 * if reg_loop_cnt >= reg_loop_max skip the loop body
20600		 */
20601		BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
20602		/* callback call,
20603		 * correct callback offset would be set after patching
20604		 */
20605		BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
20606		BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
20607		BPF_CALL_REL(0),
20608		/* increment loop counter */
20609		BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
20610		/* jump to loop header if callback returned 0 */
20611		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
20612		/* return value of bpf_loop,
20613		 * set R0 to the number of iterations
20614		 */
20615		BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
20616		/* restore original values of R6, R7, R8 */
20617		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
20618		BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
20619		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
20620	};
20621
20622	*cnt = ARRAY_SIZE(insn_buf);
20623	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
20624	if (!new_prog)
20625		return new_prog;
20626
20627	/* callback start is known only after patching */
20628	callback_start = env->subprog_info[callback_subprogno].start;
20629	/* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
20630	call_insn_offset = position + 12;
20631	callback_offset = callback_start - call_insn_offset - 1;
20632	new_prog->insnsi[call_insn_offset].imm = callback_offset;
20633
20634	return new_prog;
20635}
20636
20637static bool is_bpf_loop_call(struct bpf_insn *insn)
20638{
20639	return insn->code == (BPF_JMP | BPF_CALL) &&
20640		insn->src_reg == 0 &&
20641		insn->imm == BPF_FUNC_loop;
20642}
20643
20644/* For all sub-programs in the program (including main) check
20645 * insn_aux_data to see if there are bpf_loop calls that require
 * inlining. If such calls are found, they are replaced with the
 * sequence of instructions produced by the `inline_bpf_loop` function
 * and the subprog's stack_depth is increased by the size of 3 registers.
 * This stack space is used to spill the values of R6, R7 and R8, which
 * are used to store the loop bound, counter and context variables.
20652 */
20653static int optimize_bpf_loop(struct bpf_verifier_env *env)
20654{
20655	struct bpf_subprog_info *subprogs = env->subprog_info;
20656	int i, cur_subprog = 0, cnt, delta = 0;
20657	struct bpf_insn *insn = env->prog->insnsi;
20658	int insn_cnt = env->prog->len;
20659	u16 stack_depth = subprogs[cur_subprog].stack_depth;
20660	u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
20661	u16 stack_depth_extra = 0;
20662
20663	for (i = 0; i < insn_cnt; i++, insn++) {
20664		struct bpf_loop_inline_state *inline_state =
20665			&env->insn_aux_data[i + delta].loop_inline_state;
20666
20667		if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
20668			struct bpf_prog *new_prog;
20669
20670			stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
20671			new_prog = inline_bpf_loop(env,
20672						   i + delta,
20673						   -(stack_depth + stack_depth_extra),
20674						   inline_state->callback_subprogno,
20675						   &cnt);
20676			if (!new_prog)
20677				return -ENOMEM;
20678
20679			delta     += cnt - 1;
20680			env->prog  = new_prog;
20681			insn       = new_prog->insnsi + i + delta;
20682		}
20683
20684		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
20685			subprogs[cur_subprog].stack_depth += stack_depth_extra;
20686			cur_subprog++;
20687			stack_depth = subprogs[cur_subprog].stack_depth;
20688			stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
20689			stack_depth_extra = 0;
20690		}
20691	}
20692
20693	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
20694
20695	return 0;
20696}
20697
20698static void free_states(struct bpf_verifier_env *env)
20699{
20700	struct bpf_verifier_state_list *sl, *sln;
20701	int i;
20702
20703	sl = env->free_list;
20704	while (sl) {
20705		sln = sl->next;
20706		free_verifier_state(&sl->state, false);
20707		kfree(sl);
20708		sl = sln;
20709	}
20710	env->free_list = NULL;
20711
20712	if (!env->explored_states)
20713		return;
20714
20715	for (i = 0; i < state_htab_size(env); i++) {
20716		sl = env->explored_states[i];
20717
20718		while (sl) {
20719			sln = sl->next;
20720			free_verifier_state(&sl->state, false);
20721			kfree(sl);
20722			sl = sln;
20723		}
20724		env->explored_states[i] = NULL;
20725	}
20726}
20727
20728static int do_check_common(struct bpf_verifier_env *env, int subprog)
20729{
20730	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
20731	struct bpf_subprog_info *sub = subprog_info(env, subprog);
20732	struct bpf_verifier_state *state;
20733	struct bpf_reg_state *regs;
20734	int ret, i;
20735
20736	env->prev_linfo = NULL;
20737	env->pass_cnt++;
20738
20739	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
20740	if (!state)
20741		return -ENOMEM;
20742	state->curframe = 0;
20743	state->speculative = false;
20744	state->branches = 1;
20745	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
20746	if (!state->frame[0]) {
20747		kfree(state);
20748		return -ENOMEM;
20749	}
20750	env->cur_state = state;
20751	init_func_state(env, state->frame[0],
20752			BPF_MAIN_FUNC /* callsite */,
20753			0 /* frameno */,
20754			subprog);
20755	state->first_insn_idx = env->subprog_info[subprog].start;
20756	state->last_insn_idx = -1;
20757
20758	regs = state->frame[state->curframe]->regs;
20759	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
20760		const char *sub_name = subprog_name(env, subprog);
20761		struct bpf_subprog_arg_info *arg;
20762		struct bpf_reg_state *reg;
20763
20764		verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
20765		ret = btf_prepare_func_args(env, subprog);
20766		if (ret)
20767			goto out;
20768
20769		if (subprog_is_exc_cb(env, subprog)) {
20770			state->frame[0]->in_exception_callback_fn = true;
			/* We have already ensured that the callback returns an integer, just
			 * like all global subprogs. We only need to ensure that it has a
			 * single scalar argument.
			 */
20775			if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
20776				verbose(env, "exception cb only supports single integer argument\n");
20777				ret = -EINVAL;
20778				goto out;
20779			}
20780		}
20781		for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
20782			arg = &sub->args[i - BPF_REG_1];
20783			reg = &regs[i];
20784
20785			if (arg->arg_type == ARG_PTR_TO_CTX) {
20786				reg->type = PTR_TO_CTX;
20787				mark_reg_known_zero(env, regs, i);
20788			} else if (arg->arg_type == ARG_ANYTHING) {
20789				reg->type = SCALAR_VALUE;
20790				mark_reg_unknown(env, regs, i);
20791			} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
				/* assume an ordinary LOCAL dynptr type */
20793				__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
20794			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
20795				reg->type = PTR_TO_MEM;
20796				if (arg->arg_type & PTR_MAYBE_NULL)
20797					reg->type |= PTR_MAYBE_NULL;
20798				mark_reg_known_zero(env, regs, i);
20799				reg->mem_size = arg->mem_size;
20800				reg->id = ++env->id_gen;
20801			} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
20802				reg->type = PTR_TO_BTF_ID;
20803				if (arg->arg_type & PTR_MAYBE_NULL)
20804					reg->type |= PTR_MAYBE_NULL;
20805				if (arg->arg_type & PTR_UNTRUSTED)
20806					reg->type |= PTR_UNTRUSTED;
20807				if (arg->arg_type & PTR_TRUSTED)
20808					reg->type |= PTR_TRUSTED;
20809				mark_reg_known_zero(env, regs, i);
20810				reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
20811				reg->btf_id = arg->btf_id;
20812				reg->id = ++env->id_gen;
20813			} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
20814				/* caller can pass either PTR_TO_ARENA or SCALAR */
20815				mark_reg_unknown(env, regs, i);
20816			} else {
20817				WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
20818					  i - BPF_REG_1, arg->arg_type);
20819				ret = -EFAULT;
20820				goto out;
20821			}
20822		}
20823	} else {
		/* if the main BPF program has associated BTF info, validate
		 * that it matches the expected signature; otherwise mark the
		 * main program's BTF info as unreliable
		 */
20828		if (env->prog->aux->func_info_aux) {
20829			ret = btf_prepare_func_args(env, 0);
20830			if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
20831				env->prog->aux->func_info_aux[0].unreliable = true;
20832		}
20833
20834		/* 1st arg to a function */
20835		regs[BPF_REG_1].type = PTR_TO_CTX;
20836		mark_reg_known_zero(env, regs, BPF_REG_1);
20837	}
20838
20839	ret = do_check(env);
20840out:
20841	/* check for NULL is necessary, since cur_state can be freed inside
20842	 * do_check() under memory pressure.
20843	 */
20844	if (env->cur_state) {
20845		free_verifier_state(env->cur_state, true);
20846		env->cur_state = NULL;
20847	}
20848	while (!pop_stack(env, NULL, NULL, false));
20849	if (!ret && pop_log)
20850		bpf_vlog_reset(&env->log, 0);
20851	free_states(env);
20852	return ret;
20853}
20854
20855/* Lazily verify all global functions based on their BTF, if they are called
 * from the main BPF program or any of its subprograms, transitively.
20857 * BPF global subprogs called from dead code are not validated.
20858 * All callable global functions must pass verification.
20859 * Otherwise the whole program is rejected.
20860 * Consider:
20861 * int bar(int);
20862 * int foo(int f)
20863 * {
20864 *    return bar(f);
20865 * }
20866 * int bar(int b)
20867 * {
20868 *    ...
20869 * }
 * foo() will be verified first for R1=any_scalar_value. During its verification
 * it will be assumed that bar() has already been verified successfully, and the
 * call to bar() from foo() will only be checked for a type match. Later bar()
 * will be verified independently to check that it's safe for R1=any_scalar_value.
20874 */
20875static int do_check_subprogs(struct bpf_verifier_env *env)
20876{
20877	struct bpf_prog_aux *aux = env->prog->aux;
20878	struct bpf_func_info_aux *sub_aux;
20879	int i, ret, new_cnt;
20880
20881	if (!aux->func_info)
20882		return 0;
20883
20884	/* exception callback is presumed to be always called */
20885	if (env->exception_callback_subprog)
20886		subprog_aux(env, env->exception_callback_subprog)->called = true;
20887
20888again:
20889	new_cnt = 0;
20890	for (i = 1; i < env->subprog_cnt; i++) {
20891		if (!subprog_is_global(env, i))
20892			continue;
20893
20894		sub_aux = subprog_aux(env, i);
20895		if (!sub_aux->called || sub_aux->verified)
20896			continue;
20897
20898		env->insn_idx = env->subprog_info[i].start;
20899		WARN_ON_ONCE(env->insn_idx == 0);
20900		ret = do_check_common(env, i);
20901		if (ret) {
20902			return ret;
20903		} else if (env->log.level & BPF_LOG_LEVEL) {
20904			verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
20905				i, subprog_name(env, i));
20906		}
20907
		/* We verified a new global subprog; it might have called some
		 * more global subprogs that we haven't verified yet, so we
		 * need to do another pass over the subprogs to verify those.
		 */
20912		sub_aux->verified = true;
20913		new_cnt++;
20914	}
20915
20916	/* We can't loop forever as we verify at least one global subprog on
20917	 * each pass.
20918	 */
20919	if (new_cnt)
20920		goto again;
20921
20922	return 0;
20923}
20924
20925static int do_check_main(struct bpf_verifier_env *env)
20926{
20927	int ret;
20928
20929	env->insn_idx = 0;
20930	ret = do_check_common(env, 0);
20931	if (!ret)
20932		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
20933	return ret;
20934}
20935
20936
20937static void print_verification_stats(struct bpf_verifier_env *env)
20938{
20939	int i;
20940
20941	if (env->log.level & BPF_LOG_STATS) {
20942		verbose(env, "verification time %lld usec\n",
20943			div_u64(env->verification_time, 1000));
20944		verbose(env, "stack depth ");
20945		for (i = 0; i < env->subprog_cnt; i++) {
20946			u32 depth = env->subprog_info[i].stack_depth;
20947
20948			verbose(env, "%d", depth);
20949			if (i + 1 < env->subprog_cnt)
20950				verbose(env, "+");
20951		}
20952		verbose(env, "\n");
20953	}
20954	verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
20955		"total_states %d peak_states %d mark_read %d\n",
20956		env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
20957		env->max_states_per_insn, env->total_states,
20958		env->peak_states, env->longest_mark_read_walk);
20959}
20960
20961static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
20962{
20963	const struct btf_type *t, *func_proto;
20964	const struct bpf_struct_ops_desc *st_ops_desc;
20965	const struct bpf_struct_ops *st_ops;
20966	const struct btf_member *member;
20967	struct bpf_prog *prog = env->prog;
20968	u32 btf_id, member_idx;
20969	struct btf *btf;
20970	const char *mname;
20971
20972	if (!prog->gpl_compatible) {
20973		verbose(env, "struct ops programs must have a GPL compatible license\n");
20974		return -EINVAL;
20975	}
20976
20977	if (!prog->aux->attach_btf_id)
20978		return -ENOTSUPP;
20979
20980	btf = prog->aux->attach_btf;
20981	if (btf_is_module(btf)) {
20982		/* Make sure st_ops is valid through the lifetime of env */
20983		env->attach_btf_mod = btf_try_get_module(btf);
20984		if (!env->attach_btf_mod) {
20985			verbose(env, "struct_ops module %s is not found\n",
20986				btf_get_name(btf));
20987			return -ENOTSUPP;
20988		}
20989	}
20990
20991	btf_id = prog->aux->attach_btf_id;
20992	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
20993	if (!st_ops_desc) {
20994		verbose(env, "attach_btf_id %u is not a supported struct\n",
20995			btf_id);
20996		return -ENOTSUPP;
20997	}
20998	st_ops = st_ops_desc->st_ops;
20999
21000	t = st_ops_desc->type;
21001	member_idx = prog->expected_attach_type;
21002	if (member_idx >= btf_type_vlen(t)) {
21003		verbose(env, "attach to invalid member idx %u of struct %s\n",
21004			member_idx, st_ops->name);
21005		return -EINVAL;
21006	}
21007
21008	member = &btf_type_member(t)[member_idx];
21009	mname = btf_name_by_offset(btf, member->name_off);
21010	func_proto = btf_type_resolve_func_ptr(btf, member->type,
21011					       NULL);
21012	if (!func_proto) {
21013		verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
21014			mname, member_idx, st_ops->name);
21015		return -EINVAL;
21016	}
21017
21018	if (st_ops->check_member) {
21019		int err = st_ops->check_member(t, member, prog);
21020
21021		if (err) {
21022			verbose(env, "attach to unsupported member %s of struct %s\n",
21023				mname, st_ops->name);
21024			return err;
21025		}
21026	}
21027
	/* btf_ctx_access() uses this to provide argument type info */
21029	prog->aux->ctx_arg_info =
21030		st_ops_desc->arg_info[member_idx].info;
21031	prog->aux->ctx_arg_info_size =
21032		st_ops_desc->arg_info[member_idx].cnt;
21033
21034	prog->aux->attach_func_proto = func_proto;
21035	prog->aux->attach_func_name = mname;
21036	env->ops = st_ops->verifier_ops;
21037
21038	return 0;
21039}
21040#define SECURITY_PREFIX "security_"
21041
21042static int check_attach_modify_return(unsigned long addr, const char *func_name)
21043{
21044	if (within_error_injection_list(addr) ||
21045	    !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
21046		return 0;
21047
21048	return -EINVAL;
21049}
21050
21051/* list of non-sleepable functions that are otherwise on
21052 * ALLOW_ERROR_INJECTION list
21053 */
21054BTF_SET_START(btf_non_sleepable_error_inject)
21055/* Three functions below can be called from sleepable and non-sleepable context.
21056 * Assume non-sleepable from bpf safety point of view.
21057 */
21058BTF_ID(func, __filemap_add_folio)
21059BTF_ID(func, should_fail_alloc_page)
21060BTF_ID(func, should_failslab)
21061BTF_SET_END(btf_non_sleepable_error_inject)
21062
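/* Returns non-zero if @btf_id is in the denylist above and therefore must
 * not be treated as a sleepable attach point despite being error-injectable.
 */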
21063static int check_non_sleepable_error_inject(u32 btf_id)
21064{
21065	return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
21066}
21067
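/* Resolve and validate the attach target for tracing, LSM and EXT programs.
 *
 * @btf_id names either a kernel (or module) BTF function/typedef, e.g. as
 * filled in by libbpf from a SEC("fentry/<func>") annotation, or a
 * subprogram of @tgt_prog when attaching to another BPF program.  Depending
 * on expected_attach_type the target is checked to be a btf_trace_*
 * tracepoint typedef, an iterator function, or a plain function whose
 * prototype is distilled into @tgt_info->fmodel; for fentry/fexit-style
 * attachments the target address is resolved via the subprog's bpf_func or
 * kallsyms, and sleepable/fmod_ret restrictions are enforced.  On success
 * @tgt_info carries the address, name, type and (for module targets) a
 * module reference.
 */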
21068int bpf_check_attach_target(struct bpf_verifier_log *log,
21069			    const struct bpf_prog *prog,
21070			    const struct bpf_prog *tgt_prog,
21071			    u32 btf_id,
21072			    struct bpf_attach_target_info *tgt_info)
21073{
21074	bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
21075	bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
21076	const char prefix[] = "btf_trace_";
21077	int ret = 0, subprog = -1, i;
21078	const struct btf_type *t;
21079	bool conservative = true;
21080	const char *tname;
21081	struct btf *btf;
21082	long addr = 0;
21083	struct module *mod = NULL;
21084
21085	if (!btf_id) {
21086		bpf_log(log, "Tracing programs must provide btf_id\n");
21087		return -EINVAL;
21088	}
21089	btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
21090	if (!btf) {
21091		bpf_log(log,
21092			"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
21093		return -EINVAL;
21094	}
21095	t = btf_type_by_id(btf, btf_id);
21096	if (!t) {
21097		bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
21098		return -EINVAL;
21099	}
21100	tname = btf_name_by_offset(btf, t->name_off);
21101	if (!tname) {
21102		bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
21103		return -EINVAL;
21104	}
21105	if (tgt_prog) {
21106		struct bpf_prog_aux *aux = tgt_prog->aux;
21107
21108		if (bpf_prog_is_dev_bound(prog->aux) &&
21109		    !bpf_prog_dev_bound_match(prog, tgt_prog)) {
			bpf_log(log, "Target program bound device mismatch\n");
21111			return -EINVAL;
21112		}
21113
21114		for (i = 0; i < aux->func_info_cnt; i++)
21115			if (aux->func_info[i].type_id == btf_id) {
21116				subprog = i;
21117				break;
21118			}
21119		if (subprog == -1) {
21120			bpf_log(log, "Subprog %s doesn't exist\n", tname);
21121			return -EINVAL;
21122		}
21123		if (aux->func && aux->func[subprog]->aux->exception_cb) {
21124			bpf_log(log,
21125				"%s programs cannot attach to exception callback\n",
21126				prog_extension ? "Extension" : "FENTRY/FEXIT");
21127			return -EINVAL;
21128		}
21129		conservative = aux->func_info_aux[subprog].unreliable;
21130		if (prog_extension) {
21131			if (conservative) {
21132				bpf_log(log,
21133					"Cannot replace static functions\n");
21134				return -EINVAL;
21135			}
21136			if (!prog->jit_requested) {
21137				bpf_log(log,
21138					"Extension programs should be JITed\n");
21139				return -EINVAL;
21140			}
21141		}
21142		if (!tgt_prog->jited) {
21143			bpf_log(log, "Can attach to only JITed progs\n");
21144			return -EINVAL;
21145		}
21146		if (prog_tracing) {
21147			if (aux->attach_tracing_prog) {
21148				/*
21149				 * Target program is an fentry/fexit which is already attached
21150				 * to another tracing program. More levels of nesting
21151				 * attachment are not allowed.
21152				 */
21153				bpf_log(log, "Cannot nest tracing program attach more than once\n");
21154				return -EINVAL;
21155			}
21156		} else if (tgt_prog->type == prog->type) {
			/*
			 * To avoid potential call chain cycles, prevent attaching of a
			 * program extension to another extension. It's ok to attach
			 * fentry/fexit to an extension program.
			 */
21162			bpf_log(log, "Cannot recursively attach\n");
21163			return -EINVAL;
21164		}
21165		if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
21166		    prog_extension &&
21167		    (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
21168		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
			/* Program extensions can extend all program types
			 * except fentry/fexit.  The reason is the following.
			 * The fentry/fexit programs are used for performance
			 * analysis, stats and can be attached to any program
			 * type.  When an extension program replaces an XDP
			 * function, it is necessary to allow performance
			 * analysis of all functions: both the original XDP
			 * program and its extension.  Hence attaching
			 * fentry/fexit to BPF_PROG_TYPE_EXT is allowed.  If
			 * extending fentry/fexit were allowed, it would be
			 * possible to create a long call chain
			 * fentry->extension->fentry->extension beyond
			 * reasonable stack size.  Hence extending fentry is
			 * not allowed.
			 */
21183			bpf_log(log, "Cannot extend fentry/fexit\n");
21184			return -EINVAL;
21185		}
21186	} else {
21187		if (prog_extension) {
21188			bpf_log(log, "Cannot replace kernel functions\n");
21189			return -EINVAL;
21190		}
21191	}
21192
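	/* Per-attach-type validation of the target's BTF type: raw
	 * tracepoints must point at a btf_trace_* typedef, iterators at a
	 * function, and fentry/fexit/fmod_ret/LSM (and extensions) at a
	 * function whose prototype can be distilled and whose address can
	 * be resolved.
	 */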
21193	switch (prog->expected_attach_type) {
21194	case BPF_TRACE_RAW_TP:
21195		if (tgt_prog) {
21196			bpf_log(log,
21197				"Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
21198			return -EINVAL;
21199		}
21200		if (!btf_type_is_typedef(t)) {
21201			bpf_log(log, "attach_btf_id %u is not a typedef\n",
21202				btf_id);
21203			return -EINVAL;
21204		}
21205		if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
21206			bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
21207				btf_id, tname);
21208			return -EINVAL;
21209		}
21210		tname += sizeof(prefix) - 1;
21211		t = btf_type_by_id(btf, t->type);
21212		if (!btf_type_is_ptr(t))
21213			/* should never happen in valid vmlinux build */
21214			return -EINVAL;
21215		t = btf_type_by_id(btf, t->type);
21216		if (!btf_type_is_func_proto(t))
21217			/* should never happen in valid vmlinux build */
21218			return -EINVAL;
21219
21220		break;
21221	case BPF_TRACE_ITER:
21222		if (!btf_type_is_func(t)) {
21223			bpf_log(log, "attach_btf_id %u is not a function\n",
21224				btf_id);
21225			return -EINVAL;
21226		}
21227		t = btf_type_by_id(btf, t->type);
21228		if (!btf_type_is_func_proto(t))
21229			return -EINVAL;
21230		ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
21231		if (ret)
21232			return ret;
21233		break;
21234	default:
21235		if (!prog_extension)
21236			return -EINVAL;
21237		fallthrough;
21238	case BPF_MODIFY_RETURN:
21239	case BPF_LSM_MAC:
21240	case BPF_LSM_CGROUP:
21241	case BPF_TRACE_FENTRY:
21242	case BPF_TRACE_FEXIT:
21243		if (!btf_type_is_func(t)) {
21244			bpf_log(log, "attach_btf_id %u is not a function\n",
21245				btf_id);
21246			return -EINVAL;
21247		}
21248		if (prog_extension &&
21249		    btf_check_type_match(log, prog, btf, t))
21250			return -EINVAL;
21251		t = btf_type_by_id(btf, t->type);
21252		if (!btf_type_is_func_proto(t))
21253			return -EINVAL;
21254
21255		if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
21256		    (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
21257		     prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
21258			return -EINVAL;
21259
21260		if (tgt_prog && conservative)
21261			t = NULL;
21262
21263		ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
21264		if (ret < 0)
21265			return ret;
21266
21267		if (tgt_prog) {
21268			if (subprog == 0)
21269				addr = (long) tgt_prog->bpf_func;
21270			else
21271				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
21272		} else {
21273			if (btf_is_module(btf)) {
21274				mod = btf_try_get_module(btf);
21275				if (mod)
21276					addr = find_kallsyms_symbol_value(mod, tname);
21277				else
21278					addr = 0;
21279			} else {
21280				addr = kallsyms_lookup_name(tname);
21281			}
21282			if (!addr) {
21283				module_put(mod);
21284				bpf_log(log,
21285					"The address of function %s cannot be found\n",
21286					tname);
21287				return -ENOENT;
21288			}
21289		}
21290
21291		if (prog->sleepable) {
21292			ret = -EINVAL;
21293			switch (prog->type) {
21294			case BPF_PROG_TYPE_TRACING:
21295
				/* fentry/fexit/fmod_ret progs can be sleepable if they are
				 * attached to a function on the ALLOW_ERROR_INJECTION list
				 * and are not in the denylist above.
				 */
21299				if (!check_non_sleepable_error_inject(btf_id) &&
21300				    within_error_injection_list(addr))
21301					ret = 0;
				/* fentry/fexit/fmod_ret progs can also be sleepable if the
				 * target is in the fmod_ret id set with the KF_SLEEPABLE flag.
				 */
21305				else {
21306					u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
21307										prog);
21308
21309					if (flags && (*flags & KF_SLEEPABLE))
21310						ret = 0;
21311				}
21312				break;
21313			case BPF_PROG_TYPE_LSM:
21314				/* LSM progs check that they are attached to bpf_lsm_*() funcs.
21315				 * Only some of them are sleepable.
21316				 */
21317				if (bpf_lsm_is_sleepable_hook(btf_id))
21318					ret = 0;
21319				break;
21320			default:
21321				break;
21322			}
21323			if (ret) {
21324				module_put(mod);
21325				bpf_log(log, "%s is not sleepable\n", tname);
21326				return ret;
21327			}
21328		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
21329			if (tgt_prog) {
21330				module_put(mod);
21331				bpf_log(log, "can't modify return codes of BPF programs\n");
21332				return -EINVAL;
21333			}
21334			ret = -EINVAL;
21335			if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
21336			    !check_attach_modify_return(addr, tname))
21337				ret = 0;
21338			if (ret) {
21339				module_put(mod);
21340				bpf_log(log, "%s() is not modifiable\n", tname);
21341				return ret;
21342			}
21343		}
21344
21345		break;
21346	}
21347	tgt_info->tgt_addr = addr;
21348	tgt_info->tgt_name = tname;
21349	tgt_info->tgt_type = t;
21350	tgt_info->tgt_mod = mod;
21351	return 0;
21352}
21353
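/* BTF IDs that BPF_PROG_TYPE_TRACING programs are never allowed to attach
 * to (enforced in check_attach_btf_id() below).  These are preemption,
 * migration and RCU primitives that sit on the BPF trampoline's own
 * entry/exit path, so attaching a tracing program there could recurse.
 */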
21354BTF_SET_START(btf_id_deny)
21355BTF_ID_UNUSED
21356#ifdef CONFIG_SMP
21357BTF_ID(func, migrate_disable)
21358BTF_ID(func, migrate_enable)
21359#endif
21360#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
21361BTF_ID(func, rcu_read_unlock_strict)
21362#endif
21363#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
21364BTF_ID(func, preempt_count_add)
21365BTF_ID(func, preempt_count_sub)
21366#endif
21367#ifdef CONFIG_PREEMPT_RCU
21368BTF_ID(func, __rcu_read_lock)
21369BTF_ID(func, __rcu_read_unlock)
21370#endif
21371BTF_SET_END(btf_id_deny)
21372
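/* Which programs may request BPF_F_SLEEPABLE: fentry/fexit/fmod_ret/iter
 * tracing programs, LSM, kprobe (uprobe only) and struct_ops programs.
 */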
21373static bool can_be_sleepable(struct bpf_prog *prog)
21374{
21375	if (prog->type == BPF_PROG_TYPE_TRACING) {
21376		switch (prog->expected_attach_type) {
21377		case BPF_TRACE_FENTRY:
21378		case BPF_TRACE_FEXIT:
21379		case BPF_MODIFY_RETURN:
21380		case BPF_TRACE_ITER:
21381			return true;
21382		default:
21383			return false;
21384		}
21385	}
21386	return prog->type == BPF_PROG_TYPE_LSM ||
21387	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
21388	       prog->type == BPF_PROG_TYPE_STRUCT_OPS;
21389}
21390
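/* Validate prog->aux->attach_btf_id for program types that need an attach
 * target.  Syscall and struct_ops programs are handled specially; for
 * tracing, LSM and EXT programs the target is resolved via
 * bpf_check_attach_target(), the resulting function prototype, name and
 * module are stored in prog->aux, and (except for raw tracepoints and
 * iterators) a trampoline keyed by (target prog, btf_id) is acquired up
 * front.
 */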
21391static int check_attach_btf_id(struct bpf_verifier_env *env)
21392{
21393	struct bpf_prog *prog = env->prog;
21394	struct bpf_prog *tgt_prog = prog->aux->dst_prog;
21395	struct bpf_attach_target_info tgt_info = {};
21396	u32 btf_id = prog->aux->attach_btf_id;
21397	struct bpf_trampoline *tr;
21398	int ret;
21399	u64 key;
21400
21401	if (prog->type == BPF_PROG_TYPE_SYSCALL) {
21402		if (prog->sleepable)
			/* attach_btf_id has already been checked to be zero */
21404			return 0;
21405		verbose(env, "Syscall programs can only be sleepable\n");
21406		return -EINVAL;
21407	}
21408
21409	if (prog->sleepable && !can_be_sleepable(prog)) {
21410		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
21411		return -EINVAL;
21412	}
21413
21414	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
21415		return check_struct_ops_btf_id(env);
21416
21417	if (prog->type != BPF_PROG_TYPE_TRACING &&
21418	    prog->type != BPF_PROG_TYPE_LSM &&
21419	    prog->type != BPF_PROG_TYPE_EXT)
21420		return 0;
21421
21422	ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
21423	if (ret)
21424		return ret;
21425
21426	if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
		/* To make freplace programs equivalent to their targets, they
		 * need to inherit env->ops and expected_attach_type for the
		 * rest of the verification.
		 */
21431		env->ops = bpf_verifier_ops[tgt_prog->type];
21432		prog->expected_attach_type = tgt_prog->expected_attach_type;
21433	}
21434
21435	/* store info about the attachment target that will be used later */
21436	prog->aux->attach_func_proto = tgt_info.tgt_type;
21437	prog->aux->attach_func_name = tgt_info.tgt_name;
21438	prog->aux->mod = tgt_info.tgt_mod;
21439
21440	if (tgt_prog) {
21441		prog->aux->saved_dst_prog_type = tgt_prog->type;
21442		prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
21443	}
21444
21445	if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
21446		prog->aux->attach_btf_trace = true;
21447		return 0;
21448	} else if (prog->expected_attach_type == BPF_TRACE_ITER) {
21449		if (!bpf_iter_prog_supported(prog))
21450			return -EINVAL;
21451		return 0;
21452	}
21453
21454	if (prog->type == BPF_PROG_TYPE_LSM) {
21455		ret = bpf_lsm_verify_prog(&env->log, prog);
21456		if (ret < 0)
21457			return ret;
21458	} else if (prog->type == BPF_PROG_TYPE_TRACING &&
21459		   btf_id_set_contains(&btf_id_deny, btf_id)) {
21460		return -EINVAL;
21461	}
21462
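	/* Everything else (fentry/fexit/fmod_ret, LSM, freplace) attaches
	 * through a trampoline shared per (target prog, btf_id) pair; get or
	 * create it now and stash it in dst_trampoline for the attach step.
	 */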
21463	key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
21464	tr = bpf_trampoline_get(key, &tgt_info);
21465	if (!tr)
21466		return -ENOMEM;
21467
21468	if (tgt_prog && tgt_prog->aux->tail_call_reachable)
21469		tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
21470
21471	prog->aux->dst_trampoline = tr;
21472	return 0;
21473}
21474
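/* Lazily parse vmlinux BTF on first use, serialized by bpf_verifier_lock.
 * Returns the cached btf_vmlinux, which may be an ERR_PTR if parsing failed
 * or NULL when CONFIG_DEBUG_INFO_BTF is disabled.
 */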
21475struct btf *bpf_get_btf_vmlinux(void)
21476{
21477	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
21478		mutex_lock(&bpf_verifier_lock);
21479		if (!btf_vmlinux)
21480			btf_vmlinux = btf_parse_vmlinux();
21481		mutex_unlock(&bpf_verifier_lock);
21482	}
21483	return btf_vmlinux;
21484}
21485
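/* Main verifier entry point, called once per BPF_PROG_LOAD.
 *
 * Allocates the verifier environment and per-insn aux data, derives
 * capability flags from the program's token, initializes the log, runs the
 * subprog/BTF/attach-target/CFG checks, then the full path-sensitive
 * analysis (do_check_main/do_check_subprogs), followed by the rewrite
 * passes (dead code elimination, ctx access conversion, misc fixups,
 * zero-extension insertion), and finally publishes the used maps/BTFs and
 * verification stats into the program.
 */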
21486int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
21487{
21488	u64 start_time = ktime_get_ns();
21489	struct bpf_verifier_env *env;
21490	int i, len, ret = -EINVAL, err;
21491	u32 log_true_size;
21492	bool is_priv;
21493
	/* if no program types are registered, no program can be valid */
21495	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
21496		return -EINVAL;
21497
21498	/* 'struct bpf_verifier_env' can be global, but since it's not small,
21499	 * allocate/free it every time bpf_check() is called
21500	 */
21501	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
21502	if (!env)
21503		return -ENOMEM;
21504
21505	env->bt.env = env;
21506
21507	len = (*prog)->len;
21508	env->insn_aux_data =
21509		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
21510	ret = -ENOMEM;
21511	if (!env->insn_aux_data)
21512		goto err_free_env;
21513	for (i = 0; i < len; i++)
21514		env->insn_aux_data[i].orig_idx = i;
21515	env->prog = *prog;
21516	env->ops = bpf_verifier_ops[env->prog->type];
21517	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
21518
21519	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
21520	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
21521	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
21522	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
21523	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
21524
21525	bpf_get_btf_vmlinux();
21526
	/* grab the mutex to protect a few globals used by the verifier */
21528	if (!is_priv)
21529		mutex_lock(&bpf_verifier_lock);
21530
	/* user could have requested verbose verifier output
	 * and supplied a buffer to store the verification trace
	 */
21534	ret = bpf_vlog_init(&env->log, attr->log_level,
21535			    (char __user *) (unsigned long) attr->log_buf,
21536			    attr->log_size);
21537	if (ret)
21538		goto err_unlock;
21539
21540	mark_verifier_state_clean(env);
21541
21542	if (IS_ERR(btf_vmlinux)) {
		/* Either gcc, pahole or the kernel is broken. */
21544		verbose(env, "in-kernel BTF is malformed\n");
21545		ret = PTR_ERR(btf_vmlinux);
21546		goto skip_full_check;
21547	}
21548
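	/* Alignment policy: BPF_F_STRICT_ALIGNMENT requests strict checking,
	 * architectures without efficient unaligned access always get it,
	 * and BPF_F_ANY_ALIGNMENT switches it back off.
	 */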
21549	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
21550	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
21551		env->strict_alignment = true;
21552	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
21553		env->strict_alignment = false;
21554
21555	if (is_priv)
21556		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
21557	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
21558
21559	env->explored_states = kvcalloc(state_htab_size(env),
21560				       sizeof(struct bpf_verifier_state_list *),
21561				       GFP_USER);
21562	ret = -ENOMEM;
21563	if (!env->explored_states)
21564		goto skip_full_check;
21565
21566	ret = check_btf_info_early(env, attr, uattr);
21567	if (ret < 0)
21568		goto skip_full_check;
21569
21570	ret = add_subprog_and_kfunc(env);
21571	if (ret < 0)
21572		goto skip_full_check;
21573
21574	ret = check_subprogs(env);
21575	if (ret < 0)
21576		goto skip_full_check;
21577
21578	ret = check_btf_info(env, attr, uattr);
21579	if (ret < 0)
21580		goto skip_full_check;
21581
21582	ret = check_attach_btf_id(env);
21583	if (ret)
21584		goto skip_full_check;
21585
21586	ret = resolve_pseudo_ldimm64(env);
21587	if (ret < 0)
21588		goto skip_full_check;
21589
21590	if (bpf_prog_is_offloaded(env->prog->aux)) {
21591		ret = bpf_prog_offload_verifier_prep(env->prog);
21592		if (ret)
21593			goto skip_full_check;
21594	}
21595
21596	ret = check_cfg(env);
21597	if (ret < 0)
21598		goto skip_full_check;
21599
21600	ret = do_check_main(env);
21601	ret = ret ?: do_check_subprogs(env);
21602
21603	if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
21604		ret = bpf_prog_offload_finalize(env);
21605
21606skip_full_check:
21607	kvfree(env->explored_states);
21608
21609	if (ret == 0)
21610		ret = check_max_stack_depth(env);
21611
21612	/* instruction rewrites happen after this point */
21613	if (ret == 0)
21614		ret = optimize_bpf_loop(env);
21615
21616	if (is_priv) {
21617		if (ret == 0)
21618			opt_hard_wire_dead_code_branches(env);
21619		if (ret == 0)
21620			ret = opt_remove_dead_code(env);
21621		if (ret == 0)
21622			ret = opt_remove_nops(env);
21623	} else {
21624		if (ret == 0)
21625			sanitize_dead_code(env);
21626	}
21627
21628	if (ret == 0)
21629		/* program is valid, convert *(u32*)(ctx + off) accesses */
21630		ret = convert_ctx_accesses(env);
21631
21632	if (ret == 0)
21633		ret = do_misc_fixups(env);
21634
	/* Do the 32-bit optimization after insn patching is done so that the
	 * patched insns are handled correctly.
	 */
21638	if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
21639		ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
21640		env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
21641								     : false;
21642	}
21643
21644	if (ret == 0)
21645		ret = fixup_call_args(env);
21646
21647	env->verification_time = ktime_get_ns() - start_time;
21648	print_verification_stats(env);
21649	env->prog->aux->verified_insns = env->insn_processed;
21650
21651	/* preserve original error even if log finalization is successful */
21652	err = bpf_vlog_finalize(&env->log, &log_true_size);
21653	if (err)
21654		ret = err;
21655
21656	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
21657	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
21658				  &log_true_size, sizeof(log_true_size))) {
21659		ret = -EFAULT;
21660		goto err_release_maps;
21661	}
21662
21663	if (ret)
21664		goto err_release_maps;
21665
21666	if (env->used_map_cnt) {
		/* if program passed verifier, update used_maps in bpf_prog_aux */
21668		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
21669							  sizeof(env->used_maps[0]),
21670							  GFP_KERNEL);
21671
21672		if (!env->prog->aux->used_maps) {
21673			ret = -ENOMEM;
21674			goto err_release_maps;
21675		}
21676
21677		memcpy(env->prog->aux->used_maps, env->used_maps,
21678		       sizeof(env->used_maps[0]) * env->used_map_cnt);
21679		env->prog->aux->used_map_cnt = env->used_map_cnt;
21680	}
21681	if (env->used_btf_cnt) {
21682		/* if program passed verifier, update used_btfs in bpf_prog_aux */
21683		env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
21684							  sizeof(env->used_btfs[0]),
21685							  GFP_KERNEL);
21686		if (!env->prog->aux->used_btfs) {
21687			ret = -ENOMEM;
21688			goto err_release_maps;
21689		}
21690
21691		memcpy(env->prog->aux->used_btfs, env->used_btfs,
21692		       sizeof(env->used_btfs[0]) * env->used_btf_cnt);
21693		env->prog->aux->used_btf_cnt = env->used_btf_cnt;
21694	}
21695	if (env->used_map_cnt || env->used_btf_cnt) {
21696		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
21697		 * bpf_ld_imm64 instructions
21698		 */
21699		convert_pseudo_ld_imm64(env);
21700	}
21701
21702	adjust_btf_func(env);
21703
21704err_release_maps:
21705	if (!env->prog->aux->used_maps)
		/* if we didn't copy map pointers into prog->aux, release
		 * them now. Otherwise free_used_maps() will release them.
		 */
21709		release_maps(env);
21710	if (!env->prog->aux->used_btfs)
21711		release_btfs(env);
21712
	/* extension progs temporarily inherit the attach_type of their targets
	 * for verification purposes, so set it back to zero before returning
	 */
21716	if (env->prog->type == BPF_PROG_TYPE_EXT)
21717		env->prog->expected_attach_type = 0;
21718
21719	*prog = env->prog;
21720
21721	module_put(env->attach_btf_mod);
21722err_unlock:
21723	if (!is_priv)
21724		mutex_unlock(&bpf_verifier_lock);
21725	vfree(env->insn_aux_data);
21726err_free_env:
21727	kfree(env);
21728	return ret;
21729}
21730