1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#include <sys/cdefs.h>
36#include "opt_ddb.h"
37#include "opt_ktrace.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/capsicum.h>
42#include <sys/counter.h>
43#include <sys/filedesc.h>
44#include <sys/fnv_hash.h>
45#include <sys/kernel.h>
46#include <sys/ktr.h>
47#include <sys/lock.h>
48#include <sys/malloc.h>
49#include <sys/fcntl.h>
50#include <sys/jail.h>
51#include <sys/mount.h>
52#include <sys/namei.h>
53#include <sys/proc.h>
54#include <sys/seqc.h>
55#include <sys/sdt.h>
56#include <sys/smr.h>
57#include <sys/smp.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/sysproto.h>
61#include <sys/vnode.h>
62#include <ck_queue.h>
63#ifdef KTRACE
64#include <sys/ktrace.h>
65#endif
66#ifdef INVARIANTS
67#include <machine/_inttypes.h>
68#endif
69
70#include <security/audit/audit.h>
71#include <security/mac/mac_framework.h>
72
73#ifdef DDB
74#include <ddb/ddb.h>
75#endif
76
77#include <vm/uma.h>
78
79/*
80 * High level overview of name caching in the VFS layer.
81 *
82 * Originally caching was implemented as part of UFS, later extracted to allow
83 * use by other filesystems. A decision was made to make it optional and
84 * completely detached from the rest of the kernel, which comes with limitations
85 * outlined near the end of this comment block.
86 *
87 * This fundamental choice needs to be revisited. In the meantime, the current
88 * state is described below. Significance of all notable routines is explained
 * in comments placed above their implementation. Scattered throughout the
 * file are TODO comments indicating shortcomings which can be fixed without
 * reworking everything (most of the fixes will likely be reusable). Various
 * details are omitted from this explanation to avoid cluttering the overview;
 * they have to be checked by reading the code and associated commentary.
94 *
95 * Keep in mind that it's individual path components which are cached, not full
96 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
97 * one for each name.
98 *
99 * I. Data organization
100 *
101 * Entries are described by "struct namecache" objects and stored in a hash
102 * table. See cache_get_hash for more information.
103 *
104 * "struct vnode" contains pointers to source entries (names which can be found
105 * when traversing through said vnode), destination entries (names of that
 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
107 * the parent vnode.
108 *
109 * The (directory vnode; name) tuple reliably determines the target entry if
110 * it exists.
111 *
112 * Since there are no small locks at this time (all are 32 bytes in size on
113 * LP64), the code works around the problem by introducing lock arrays to
114 * protect hash buckets and vnode lists.
115 *
116 * II. Filesystem integration
117 *
118 * Filesystems participating in name caching do the following:
119 * - set vop_lookup routine to vfs_cache_lookup
120 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
121 * - if they support lockless lookup (see below), vop_fplookup_vexec and
122 *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
123 *   mount point
124 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
125 *   applicable
126 * - call cache_enter to add entries depending on the MAKEENTRY flag
127 *
128 * With the above in mind, there are 2 entry points when doing lookups:
129 * - ... -> namei -> cache_fplookup -- this is the default
130 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
131 *   should the above fail
132 *
 * Example code flow of how an entry is added:
134 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
135 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
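 *
 * For illustration, the vop_vector of a participating filesystem might be
 * wired up roughly as below ("foofs" routines are hypothetical; this is only
 * a sketch, not a recipe lifted from any particular filesystem):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_cachedlookup,
 *		.vop_fplookup_vexec	= foofs_fplookup_vexec,
 *		.vop_fplookup_symlink	= foofs_fplookup_symlink,
 *	};
 *
 * with MNTK_FPLOOKUP additionally set on the mount point if lockless lookup
 * is supported.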
136 *
137 * III. Performance considerations
138 *
 * For the lockless case, forward lookup avoids any writes to shared areas apart
 * from the terminal path component. In other words, non-modifying lookups of
141 * different files don't suffer any scalability problems in the namecache.
142 * Looking up the same file is limited by VFS and goes beyond the scope of this
143 * file.
144 *
145 * At least on amd64 the single-threaded bottleneck for long paths is hashing
 * (see cache_get_hash). There are cases where the code issues the acquire fence
 * multiple times; these can be combined on architectures where the fence is
 * costly.
148 *
 * For the locked case, each encountered vnode has to be referenced and locked
 * in order to be handed out to the caller (normally that's namei). This
 * introduces a significant single-threaded hit and serialization when running
 * multi-threaded.
152 *
153 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 * it avoids any writes to shared areas for any of the components.
155 *
156 * Unrelated insertions are partially serialized on updating the global entry
157 * counter and possibly serialized on colliding bucket or vnode locks.
158 *
159 * IV. Observability
160 *
 * Note that not everything has an explicit dtrace probe, nor should it; thus
162 * some of the one-liners below depend on implementation details.
163 *
164 * Examples:
165 *
166 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
167 * # line number, column 2 is status code (see cache_fpl_status)
168 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
169 *
170 * # Lengths of names added by binary name
171 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
172 *
173 * # Same as above but only those which exceed 64 characters
174 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
175 *
176 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
177 * # path is it
178 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
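 *
 * # Count cache misses by the missed name (a sketch; exact output depends on
 * # the argument layout of the lookup:miss probe defined later in this file)
 * dtrace -n 'vfs:namecache:lookup:miss { @[stringof(arg1)] = count(); }'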
179 *
180 * V. Limitations and implementation defects
181 *
182 * - since it is possible there is no entry for an open file, tools like
183 *   "procstat" may fail to resolve fd -> vnode -> path to anything
184 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
185 *   shortage) in which case the above problem applies
186 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
187 *   way, resolving a name may return a different path than the one used to
188 *   open it (even if said path is still valid)
189 * - by default entries are not added for newly created files
 * - adding an entry may need to evict a negative entry first, which happens in 2
191 *   distinct places (evicting on lookup, adding in a later VOP) making it
192 *   impossible to simply reuse it
193 * - there is a simple scheme to evict negative entries as the cache is approaching
194 *   its capacity, but it is very unclear if doing so is a good idea to begin with
 * - vnodes are subject to being recycled even if the target inode is left in
 *   memory, which loses the name cache entries when it perhaps should not. In
 *   case of tmpfs names get duplicated -- kept by the filesystem itself and the
 *   namecache separately
198 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space.  It is now hard to replace with malloc due to dependence on SMR, which
200 *   requires UMA zones to opt in
201 * - lack of better integration with the kernel also turns nullfs into a layered
202 *   filesystem instead of something which can take advantage of caching
203 *
 * Appendix A: where the time is lost, expanding on paragraph III
205 *
206 * While some care went into optimizing lookups, there is still plenty of
207 * performance left on the table, most notably from single-threaded standpoint.
208 * Below is a woefully incomplete list of changes which can help.  Ideas are
 * mostly sketched out; no claim is made that all kinks or prerequisites are
 * laid out.
211 *
212 * Note there is performance lost all over VFS.
213 *
214 * === SMR-only lookup
215 *
216 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
217 * lockless lookup could refrain from refing/locking the found vnode and
218 * instead return while within the SMR section. Then a call to, say,
 * vop_stat_smr could do the work (or fail with EAGAIN); finally, the result
 * would be validated by checking the seqc has not changed. This would be faster
221 * single-threaded as it dodges atomics and would provide full scalability for
222 * multicore uses. This would *not* work for open(2) or other calls which need
 * the vnode to hang around for the long haul, but would work for the
 * aforementioned stat(2) as well as access(2), readlink(2), realpathat(2) and
 * probably more.
225 *
226 * === hotpatching for sdt probes
227 *
228 * They result in *tons* of branches all over with rather regrettable codegen
 * at times. Removing sdt probes altogether gives a boost of over 2% in lookup rate.
230 * Reworking the code to patch itself at runtime with asm goto would solve it.
231 * asm goto is fully supported by gcc and clang.
232 *
233 * === copyinstr
234 *
235 * On all architectures it operates one byte at a time, while it could be
236 * word-sized instead thanks to the Mycroft trick.
237 *
238 * API itself is rather pessimal for path lookup, accepting arbitrary sizes and
239 * *optionally* filling in the length parameter.
240 *
241 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
 * size which is a multiple of the word size (and never zero), with the length
243 * always returned. On top of it the routine could be allowed to transform the
244 * buffer in arbitrary ways, most notably writing past the found length (not to
245 * be confused with writing past buffer size) -- this would allow word-sized
246 * movs while checking for '\0' later.
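 *
 * A possible prototype, purely as a sketch of the idea (the name and the exact
 * signature are made up here):
 *
 *	int copyinpath(const char *uaddr, char *kaddr, size_t len, size_t *donep);
 *
 * where len would be required to be a non-zero multiple of the word size and
 * *donep would always receive the resulting string length.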
247 *
248 * === detour through namei
249 *
 * Currently the lockless lookup suffers from being called from namei, which
 * then has to check if things worked out locklessly. Instead the lockless
 * lookup could be the actual entry point which calls what is currently namei
 * as a fallback.
253 *
254 * === avoidable branches in cache_can_fplookup
255 *
256 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
 * this is off, none of the fplookup code should execute).
258 *
259 * Both audit and capsicum branches can be combined into one, but it requires
260 * paying off a lot of tech debt first.
261 *
262 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
263 * branch.
264 *
265 * === mount stacks
266 *
267 * Crossing a mount requires checking if perhaps something is mounted on top.
268 * Instead, an additional entry could be added to struct mount with a pointer
269 * to the final mount on the stack. This would be recalculated on each
270 * mount/unmount.
271 *
272 * === root vnodes
273 *
274 * It could become part of the API contract to *always* have a rootvnode set in
275 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
276 * to be modified to always skip them.
277 *
278 * === inactive on v_usecount reaching 0
279 *
280 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
281 * processing with a bit in usecount.
282 *
283 * === v_holdcnt
284 *
285 * Hold count should probably get eliminated, but one can argue it is a useful
286 * feature. Even if so, handling of v_usecount could be decoupled from it --
 * vnlru et al would consider the vnode not-freeable if it has either a hold or
 * usecount on it.
289 *
290 * This would eliminate 2 atomics.
291 */
292
293static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
294    "Name cache");
295
296SDT_PROVIDER_DECLARE(vfs);
297SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
298    "struct vnode *");
299SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
300    "struct vnode *");
301SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
302    "char *");
303SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
304    "const char *");
305SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
306    "struct namecache *", "int", "int");
307SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
308SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
309    "char *", "struct vnode *");
310SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
311SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
312    "struct vnode *", "char *");
313SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
314    "struct vnode *");
315SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
316    "struct vnode *", "char *");
317SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
318    "char *");
319SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
320    "struct componentname *");
321SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
322    "struct componentname *");
323SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
324SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
325SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
326SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
327SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
328    "struct vnode *");
329SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
330    "char *");
331SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
332    "char *");
333SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
334
335SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
336SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
337SDT_PROBE_DECLARE(vfs, namei, lookup, return);
338
339static char __read_frequently cache_fast_lookup_enabled = true;
340
341/*
342 * This structure describes the elements in the cache of recent
343 * names looked up by namei.
344 */
345struct negstate {
346	u_char neg_flag;
347	u_char neg_hit;
348};
349_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
350    "the state must fit in a union with a pointer without growing it");
351
352struct	namecache {
353	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
354	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
355	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
356	struct	vnode *nc_dvp;		/* vnode of parent of name */
357	union {
358		struct	vnode *nu_vp;	/* vnode the name refers to */
359		struct	negstate nu_neg;/* negative entry state */
360	} n_un;
361	u_char	nc_flag;		/* flag bits */
362	u_char	nc_nlen;		/* length of name */
363	char	nc_name[];		/* segment name + nul */
364};
365
366/*
367 * struct namecache_ts repeats struct namecache layout up to the
368 * nc_nlen member.
369 * struct namecache_ts is used in place of struct namecache when time(s) need
370 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name and dotdot for the directory's
372 * parent.
373 *
374 * See below for alignment requirement.
375 */
376struct	namecache_ts {
377	struct	timespec nc_time;	/* timespec provided by fs */
378	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
379	int	nc_ticks;		/* ticks value when entry was added */
380	int	nc_pad;
381	struct namecache nc_nc;
382};
383
384TAILQ_HEAD(cache_freebatch, namecache);
385
386/*
387 * At least mips n32 performs 64-bit accesses to timespec as found
388 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot, suffer a little bit and enforce the
390 * alignment for everyone. Note this is a nop for 64-bit platforms.
391 */
392#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
393
394/*
395 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
396 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
397 * smaller and the value was bumped to retain the total size, but it
398 * was never re-evaluated for suitability. A simple test counting
399 * lengths during package building shows that the value of 45 covers
400 * about 86% of all added entries, reaching 99% at 65.
401 *
402 * Regardless of the above, use of dedicated zones instead of malloc may be
403 * inducing additional waste. This may be hard to address as said zones are
404 * tied to VFS SMR. Even if retaining them, the current split should be
405 * re-evaluated.
406 */
407#ifdef __LP64__
408#define	CACHE_PATH_CUTOFF	45
409#define	CACHE_LARGE_PAD		6
410#else
411#define	CACHE_PATH_CUTOFF	41
412#define	CACHE_LARGE_PAD		2
413#endif
414
415#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
416#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
417#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
418#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
419
420_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
421_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
422_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
423_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
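
/*
 * Worked example of the arithmetic above, assuming a typical LP64 layout with
 * no padding before nc_name: offsetof(struct namecache, nc_name) comes out to
 * 58 (two 16-byte list entries, an 8-byte hash entry, the 8-byte nc_dvp, the
 * 8-byte union and two u_chars), so CACHE_ZONE_SMALL_SIZE is 58 + 45 + 1 = 104
 * and CACHE_ZONE_LARGE_SIZE is 58 + 255 + 1 + 6 = 320.  Both are multiples of
 * 8, which is what the asserts above verify on such platforms.
 */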
424
425#define	nc_vp		n_un.nu_vp
426#define	nc_neg		n_un.nu_neg
427
428/*
429 * Flags in namecache.nc_flag
430 */
431#define NCF_WHITE	0x01
432#define NCF_ISDOTDOT	0x02
433#define	NCF_TS		0x04
434#define	NCF_DTS		0x08
435#define	NCF_DVDROP	0x10
436#define	NCF_NEGATIVE	0x20
437#define	NCF_INVALID	0x40
438#define	NCF_WIP		0x80
439
440/*
441 * Flags in negstate.neg_flag
442 */
443#define NEG_HOT		0x01
444
445static bool	cache_neg_evict_cond(u_long lnumcache);
446
447/*
448 * Mark an entry as invalid.
449 *
450 * This is called before it starts getting deconstructed.
451 */
452static void
453cache_ncp_invalidate(struct namecache *ncp)
454{
455
456	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
457	    ("%s: entry %p already invalid", __func__, ncp));
458	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
459	atomic_thread_fence_rel();
460}
461
462/*
463 * Check whether the entry can be safely used.
464 *
465 * All places which elide locks are supposed to call this after they are
466 * done with reading from an entry.
467 */
468#define cache_ncp_canuse(ncp)	({					\
469	struct namecache *_ncp = (ncp);					\
470	u_char _nc_flag;						\
471									\
472	atomic_thread_fence_acq();					\
473	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
474	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
475})
476
477/*
478 * Like the above but also checks NCF_WHITE.
479 */
480#define cache_fpl_neg_ncp_canuse(ncp)	({				\
481	struct namecache *_ncp = (ncp);					\
482	u_char _nc_flag;						\
483									\
484	atomic_thread_fence_acq();					\
485	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
486	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
487})
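
/*
 * Typical lockless usage is roughly as follows (only a sketch; see
 * cache_lookup and cache_fplookup for the real thing):
 *
 *	vfs_smr_enter();
 *	ncp = <find the entry in the hash chain>;
 *	<read whatever is needed from *ncp>;
 *	if (!cache_ncp_canuse(ncp))
 *		<bail, falling back to the locked path>;
 *	vfs_smr_exit();
 */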
488
489VFS_SMR_DECLARE;
490
491static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
492    "Name cache parameters");
493
494static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
495SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
496    "Total namecache capacity");
497
498u_int ncsizefactor = 2;
499SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
500    "Size factor for namecache");
501
502static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
503SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
504    "Ratio of negative namecache entries");
505
506/*
507 * Negative entry % of namecache capacity above which automatic eviction is allowed.
508 *
509 * Check cache_neg_evict_cond for details.
510 */
511static u_int ncnegminpct = 3;
512
513static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
514SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
515    "Negative entry count above which automatic eviction is allowed");
516
517/*
518 * Structures associated with name caching.
519 */
520#define NCHHASH(hash) \
521	(&nchashtbl[(hash) & nchash])
522static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
523static u_long __read_mostly	nchash;			/* size of hash table */
524SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
525    "Size of namecache hash table");
526static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
527static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
528
529struct nchstats	nchstats;		/* cache effectiveness statistics */
530
531static u_int __exclusive_cache_line neg_cycle;
532
533#define ncneghash	3
534#define	numneglists	(ncneghash + 1)
535
536struct neglist {
537	struct mtx		nl_evict_lock;
538	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
539	TAILQ_HEAD(, namecache) nl_list;
540	TAILQ_HEAD(, namecache) nl_hotlist;
541	u_long			nl_hotnum;
542} __aligned(CACHE_LINE_SIZE);
543
544static struct neglist neglists[numneglists];
545
546static inline struct neglist *
547NCP2NEGLIST(struct namecache *ncp)
548{
549
550	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
551}
552
553static inline struct negstate *
554NCP2NEGSTATE(struct namecache *ncp)
555{
556
557	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
558	return (&ncp->nc_neg);
559}
560
561#define	numbucketlocks (ncbuckethash + 1)
562static u_int __read_mostly  ncbuckethash;
563static struct mtx_padalign __read_mostly  *bucketlocks;
564#define	HASH2BUCKETLOCK(hash) \
565	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
566
567#define	numvnodelocks (ncvnodehash + 1)
568static u_int __read_mostly  ncvnodehash;
569static struct mtx __read_mostly *vnodelocks;
570static inline struct mtx *
571VP2VNODELOCK(struct vnode *vp)
572{
573
574	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
575}
576
577static void
578cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
579{
580	struct namecache_ts *ncp_ts;
581
582	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
583	    (tsp == NULL && ticksp == NULL),
584	    ("No NCF_TS"));
585
586	if (tsp == NULL)
587		return;
588
589	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
590	*tsp = ncp_ts->nc_time;
591	*ticksp = ncp_ts->nc_ticks;
592}
593
594#ifdef DEBUG_CACHE
595static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
596SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
597    "VFS namecache enabled");
598#endif
599
600/* Export size information to userland */
601SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
602    sizeof(struct namecache), "sizeof(struct namecache)");
603
604/*
605 * The new name cache statistics
606 */
607static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
608    "Name cache statistics");
609
610#define STATNODE_ULONG(name, varname, descr)					\
611	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
612#define STATNODE_COUNTER(name, varname, descr)					\
613	static COUNTER_U64_DEFINE_EARLY(varname);				\
614	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
615	    descr);
616STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
617STATNODE_ULONG(count, numcache, "Number of cache entries");
618STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
619STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
620STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
621STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
622STATNODE_COUNTER(poszaps, numposzaps,
623    "Number of cache hits (positive) we do not want to cache");
624STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
625STATNODE_COUNTER(negzaps, numnegzaps,
626    "Number of cache hits (negative) we do not want to cache");
627STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
628/* These count for vn_getcwd(), too. */
629STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
630STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
631    "Number of fullpath search errors (VOP_VPTOCNP failures)");
632STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
633STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
634STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
635
636/*
637 * Debug or developer statistics.
638 */
639static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
640    "Name cache debugging");
641#define DEBUGNODE_ULONG(name, varname, descr)					\
642	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
643static u_long zap_bucket_relock_success;
644DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
645    "Number of successful removals after relocking");
646static u_long zap_bucket_fail;
647DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
648static u_long zap_bucket_fail2;
649DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
650static u_long cache_lock_vnodes_cel_3_failures;
651DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
652    "Number of times 3-way vnode locking failed");
653
654static void cache_zap_locked(struct namecache *ncp);
655static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
656    char **retbuf, size_t *buflen, size_t addend);
657static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
658    char **retbuf, size_t *buflen);
659static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
660    char **retbuf, size_t *len, size_t addend);
661
662static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
663
664static inline void
665cache_assert_vlp_locked(struct mtx *vlp)
666{
667
668	if (vlp != NULL)
669		mtx_assert(vlp, MA_OWNED);
670}
671
672static inline void
673cache_assert_vnode_locked(struct vnode *vp)
674{
675	struct mtx *vlp;
676
677	vlp = VP2VNODELOCK(vp);
678	cache_assert_vlp_locked(vlp);
679}
680
681/*
682 * Directory vnodes with entries are held for two reasons:
683 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
685 *
686 * It will be feasible to stop doing it altogether if all filesystems start
687 * supporting lockless lookup.
688 */
689static void
690cache_hold_vnode(struct vnode *vp)
691{
692
693	cache_assert_vnode_locked(vp);
694	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
695	vhold(vp);
696	counter_u64_add(numcachehv, 1);
697}
698
699static void
700cache_drop_vnode(struct vnode *vp)
701{
702
703	/*
704	 * Called after all locks are dropped, meaning we can't assert
705	 * on the state of v_cache_src.
706	 */
707	vdrop(vp);
708	counter_u64_add(numcachehv, -1);
709}
710
711/*
712 * UMA zones.
713 */
714static uma_zone_t __read_mostly cache_zone_small;
715static uma_zone_t __read_mostly cache_zone_small_ts;
716static uma_zone_t __read_mostly cache_zone_large;
717static uma_zone_t __read_mostly cache_zone_large_ts;
718
719char *
720cache_symlink_alloc(size_t size, int flags)
721{
722
723	if (size < CACHE_ZONE_SMALL_SIZE) {
724		return (uma_zalloc_smr(cache_zone_small, flags));
725	}
726	if (size < CACHE_ZONE_LARGE_SIZE) {
727		return (uma_zalloc_smr(cache_zone_large, flags));
728	}
729	counter_u64_add(symlinktoobig, 1);
730	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
731	return (NULL);
732}
733
734void
735cache_symlink_free(char *string, size_t size)
736{
737
738	MPASS(string != NULL);
739	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
740	    ("%s: size %zu too big", __func__, size));
741
742	if (size < CACHE_ZONE_SMALL_SIZE) {
743		uma_zfree_smr(cache_zone_small, string);
744		return;
745	}
746	if (size < CACHE_ZONE_LARGE_SIZE) {
747		uma_zfree_smr(cache_zone_large, string);
748		return;
749	}
750	__assert_unreachable();
751}
752
753static struct namecache *
754cache_alloc_uma(int len, bool ts)
755{
756	struct namecache_ts *ncp_ts;
757	struct namecache *ncp;
758
759	if (__predict_false(ts)) {
760		if (len <= CACHE_PATH_CUTOFF)
761			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
762		else
763			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
764		ncp = &ncp_ts->nc_nc;
765	} else {
766		if (len <= CACHE_PATH_CUTOFF)
767			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
768		else
769			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
770	}
771	return (ncp);
772}
773
774static void
775cache_free_uma(struct namecache *ncp)
776{
777	struct namecache_ts *ncp_ts;
778
779	if (__predict_false(ncp->nc_flag & NCF_TS)) {
780		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
781		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
782			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
783		else
784			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
785	} else {
786		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
787			uma_zfree_smr(cache_zone_small, ncp);
788		else
789			uma_zfree_smr(cache_zone_large, ncp);
790	}
791}
792
793static struct namecache *
794cache_alloc(int len, bool ts)
795{
796	u_long lnumcache;
797
798	/*
799	 * Avoid blowout in namecache entries.
800	 *
801	 * Bugs:
802	 * 1. filesystems may end up trying to add an already existing entry
803	 * (for example this can happen after a cache miss during concurrent
804	 * lookup), in which case we will call cache_neg_evict despite not
805	 * adding anything.
806	 * 2. the routine may fail to free anything and no provisions are made
807	 * to make it try harder (see the inside for failure modes)
808	 * 3. it only ever looks at negative entries.
809	 */
810	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
811	if (cache_neg_evict_cond(lnumcache)) {
812		lnumcache = atomic_load_long(&numcache);
813	}
814	if (__predict_false(lnumcache >= ncsize)) {
815		atomic_subtract_long(&numcache, 1);
816		counter_u64_add(numdrops, 1);
817		return (NULL);
818	}
819	return (cache_alloc_uma(len, ts));
820}
821
822static void
823cache_free(struct namecache *ncp)
824{
825
826	MPASS(ncp != NULL);
827	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
828		cache_drop_vnode(ncp->nc_dvp);
829	}
830	cache_free_uma(ncp);
831	atomic_subtract_long(&numcache, 1);
832}
833
834static void
835cache_free_batch(struct cache_freebatch *batch)
836{
837	struct namecache *ncp, *nnp;
838	int i;
839
840	i = 0;
841	if (TAILQ_EMPTY(batch))
842		goto out;
843	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
844		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
845			cache_drop_vnode(ncp->nc_dvp);
846		}
847		cache_free_uma(ncp);
848		i++;
849	}
850	atomic_subtract_long(&numcache, i);
851out:
852	SDT_PROBE1(vfs, namecache, purge, batch, i);
853}
854
855/*
856 * Hashing.
857 *
858 * The code was made to use FNV in 2001 and this choice needs to be revisited.
859 *
860 * Short summary of the difficulty:
861 * The longest name which can be inserted is NAME_MAX characters in length (or
 * 255 at the time of writing this comment), while the majority of names used
 * in practice are significantly shorter (mostly below 10). More importantly,
 * the majority of lookups performed find names even shorter than that.
865 *
866 * This poses a problem where hashes which do better than FNV past word size
867 * (or so) tend to come with additional overhead when finalizing the result,
868 * making them noticeably slower for the most commonly used range.
869 *
870 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
871 *
872 * When looking it up the most time consuming part by a large margin (at least
873 * on amd64) is hashing.  Replacing FNV with something which pessimizes short
874 * input would make the slowest part stand out even more.
875 */
876
877/*
878 * TODO: With the value stored we can do better than computing the hash based
879 * on the address.
880 */
881static void
882cache_prehash(struct vnode *vp)
883{
884
885	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
886}
887
888static uint32_t
889cache_get_hash(char *name, u_char len, struct vnode *dvp)
890{
891
892	return (fnv_32_buf(name, len, dvp->v_nchash));
893}
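
/*
 * The _iter variants below hash a name one character at a time.  They are
 * expected to arrive at the same value cache_get_hash() would produce for the
 * complete name, as both are used to pick the same hash chain.
 */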
894
895static uint32_t
896cache_get_hash_iter_start(struct vnode *dvp)
897{
898
899	return (dvp->v_nchash);
900}
901
902static uint32_t
903cache_get_hash_iter(char c, uint32_t hash)
904{
905
906	return (fnv_32_buf(&c, 1, hash));
907}
908
909static uint32_t
910cache_get_hash_iter_finish(uint32_t hash)
911{
912
913	return (hash);
914}
915
916static inline struct nchashhead *
917NCP2BUCKET(struct namecache *ncp)
918{
919	uint32_t hash;
920
921	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
922	return (NCHHASH(hash));
923}
924
925static inline struct mtx *
926NCP2BUCKETLOCK(struct namecache *ncp)
927{
928	uint32_t hash;
929
930	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
931	return (HASH2BUCKETLOCK(hash));
932}
933
934#ifdef INVARIANTS
935static void
936cache_assert_bucket_locked(struct namecache *ncp)
937{
938	struct mtx *blp;
939
940	blp = NCP2BUCKETLOCK(ncp);
941	mtx_assert(blp, MA_OWNED);
942}
943
944static void
945cache_assert_bucket_unlocked(struct namecache *ncp)
946{
947	struct mtx *blp;
948
949	blp = NCP2BUCKETLOCK(ncp);
950	mtx_assert(blp, MA_NOTOWNED);
951}
952#else
953#define cache_assert_bucket_locked(x) do { } while (0)
954#define cache_assert_bucket_unlocked(x) do { } while (0)
955#endif
956
957#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
958static void
959_cache_sort_vnodes(void **p1, void **p2)
960{
961	void *tmp;
962
963	MPASS(*p1 != NULL || *p2 != NULL);
964
965	if (*p1 > *p2) {
966		tmp = *p2;
967		*p2 = *p1;
968		*p1 = tmp;
969	}
970}
971
972static void
973cache_lock_all_buckets(void)
974{
975	u_int i;
976
977	for (i = 0; i < numbucketlocks; i++)
978		mtx_lock(&bucketlocks[i]);
979}
980
981static void
982cache_unlock_all_buckets(void)
983{
984	u_int i;
985
986	for (i = 0; i < numbucketlocks; i++)
987		mtx_unlock(&bucketlocks[i]);
988}
989
990static void
991cache_lock_all_vnodes(void)
992{
993	u_int i;
994
995	for (i = 0; i < numvnodelocks; i++)
996		mtx_lock(&vnodelocks[i]);
997}
998
999static void
1000cache_unlock_all_vnodes(void)
1001{
1002	u_int i;
1003
1004	for (i = 0; i < numvnodelocks; i++)
1005		mtx_unlock(&vnodelocks[i]);
1006}
1007
1008static int
1009cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1010{
1011
1012	cache_sort_vnodes(&vlp1, &vlp2);
1013
1014	if (vlp1 != NULL) {
1015		if (!mtx_trylock(vlp1))
1016			return (EAGAIN);
1017	}
1018	if (!mtx_trylock(vlp2)) {
1019		if (vlp1 != NULL)
1020			mtx_unlock(vlp1);
1021		return (EAGAIN);
1022	}
1023
1024	return (0);
1025}
1026
1027static void
1028cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1029{
1030
1031	MPASS(vlp1 != NULL || vlp2 != NULL);
1032	MPASS(vlp1 <= vlp2);
1033
1034	if (vlp1 != NULL)
1035		mtx_lock(vlp1);
1036	if (vlp2 != NULL)
1037		mtx_lock(vlp2);
1038}
1039
1040static void
1041cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1042{
1043
1044	MPASS(vlp1 != NULL || vlp2 != NULL);
1045
1046	if (vlp1 != NULL)
1047		mtx_unlock(vlp1);
1048	if (vlp2 != NULL)
1049		mtx_unlock(vlp2);
1050}
1051
1052static int
1053sysctl_nchstats(SYSCTL_HANDLER_ARGS)
1054{
1055	struct nchstats snap;
1056
1057	if (req->oldptr == NULL)
1058		return (SYSCTL_OUT(req, 0, sizeof(snap)));
1059
1060	snap = nchstats;
1061	snap.ncs_goodhits = counter_u64_fetch(numposhits);
1062	snap.ncs_neghits = counter_u64_fetch(numneghits);
1063	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
1064	    counter_u64_fetch(numnegzaps);
1065	snap.ncs_miss = counter_u64_fetch(nummisszap) +
1066	    counter_u64_fetch(nummiss);
1067
1068	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1069}
1070SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1071    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1072    "VFS cache effectiveness statistics");
1073
1074static void
1075cache_recalc_neg_min(void)
1076{
1077
1078	neg_min = (ncsize * ncnegminpct) / 100;
1079}
1080
1081static int
1082sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1083{
1084	u_int val;
1085	int error;
1086
1087	val = ncnegminpct;
1088	error = sysctl_handle_int(oidp, &val, 0, req);
1089	if (error != 0 || req->newptr == NULL)
1090		return (error);
1091
1092	if (val == ncnegminpct)
1093		return (0);
	if (val > 99)
1095		return (EINVAL);
1096	ncnegminpct = val;
1097	cache_recalc_neg_min();
1098	return (0);
1099}
1100
1101SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1102    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1103    "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1104
1105#ifdef DEBUG_CACHE
1106/*
1107 * Grab an atomic snapshot of the name cache hash chain lengths
1108 */
1109static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1110    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1111    "hash table stats");
1112
1113static int
1114sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1115{
1116	struct nchashhead *ncpp;
1117	struct namecache *ncp;
1118	int i, error, n_nchash, *cntbuf;
1119
1120retry:
1121	n_nchash = nchash + 1;	/* nchash is max index, not count */
1122	if (req->oldptr == NULL)
1123		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1124	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1125	cache_lock_all_buckets();
1126	if (n_nchash != nchash + 1) {
1127		cache_unlock_all_buckets();
1128		free(cntbuf, M_TEMP);
1129		goto retry;
1130	}
1131	/* Scan hash tables counting entries */
1132	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1133		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1134			cntbuf[i]++;
1135	cache_unlock_all_buckets();
1136	for (error = 0, i = 0; i < n_nchash; i++)
1137		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1138			break;
1139	free(cntbuf, M_TEMP);
1140	return (error);
1141}
1142SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1143    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1144    "nchash chain lengths");
1145
1146static int
1147sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1148{
1149	int error;
1150	struct nchashhead *ncpp;
1151	struct namecache *ncp;
1152	int n_nchash;
1153	int count, maxlength, used, pct;
1154
1155	if (!req->oldptr)
1156		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1157
1158	cache_lock_all_buckets();
1159	n_nchash = nchash + 1;	/* nchash is max index, not count */
1160	used = 0;
1161	maxlength = 0;
1162
1163	/* Scan hash tables for applicable entries */
1164	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1165		count = 0;
1166		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1167			count++;
1168		}
1169		if (count)
1170			used++;
1171		if (maxlength < count)
1172			maxlength = count;
1173	}
1174	n_nchash = nchash + 1;
1175	cache_unlock_all_buckets();
1176	pct = (used * 100) / (n_nchash / 100);
1177	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1178	if (error)
1179		return (error);
1180	error = SYSCTL_OUT(req, &used, sizeof(used));
1181	if (error)
1182		return (error);
1183	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1184	if (error)
1185		return (error);
1186	error = SYSCTL_OUT(req, &pct, sizeof(pct));
1187	if (error)
1188		return (error);
1189	return (0);
1190}
1191SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1192    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1193    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1194#endif
1195
1196/*
1197 * Negative entries management
1198 *
1199 * Various workloads create plenty of negative entries and barely use them
1200 * afterwards. Moreover malicious users can keep performing bogus lookups
1201 * adding even more entries. For example "make tinderbox" as of writing this
1202 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1203 * negative.
1204 *
1205 * As such, a rather aggressive eviction method is needed. The currently
1206 * employed method is a placeholder.
1207 *
1208 * Entries are split over numneglists separate lists, each of which is further
1209 * split into hot and cold entries. Entries get promoted after getting a hit.
1210 * Eviction happens on addition of new entry.
1211 */
1212static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1213    "Name cache negative entry statistics");
1214
1215SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1216    "Number of negative cache entries");
1217
1218static COUNTER_U64_DEFINE_EARLY(neg_created);
1219SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1220    "Number of created negative entries");
1221
1222static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1223SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1224    "Number of evicted negative entries");
1225
1226static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1227SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1228    &neg_evict_skipped_empty,
1229    "Number of times evicting failed due to lack of entries");
1230
1231static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1232SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1233    &neg_evict_skipped_missed,
1234    "Number of times evicting failed due to target entry disappearing");
1235
1236static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1237SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1238    &neg_evict_skipped_contended,
1239    "Number of times evicting failed due to contention");
1240
1241SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1242    "Number of cache hits (negative)");
1243
1244static int
1245sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1246{
1247	int i, out;
1248
1249	out = 0;
1250	for (i = 0; i < numneglists; i++)
1251		out += neglists[i].nl_hotnum;
1252
1253	return (SYSCTL_OUT(req, &out, sizeof(out)));
1254}
1255SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1256    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1257    "Number of hot negative entries");
1258
1259static void
1260cache_neg_init(struct namecache *ncp)
1261{
1262	struct negstate *ns;
1263
1264	ncp->nc_flag |= NCF_NEGATIVE;
1265	ns = NCP2NEGSTATE(ncp);
1266	ns->neg_flag = 0;
1267	ns->neg_hit = 0;
1268	counter_u64_add(neg_created, 1);
1269}
1270
1271#define CACHE_NEG_PROMOTION_THRESH 2
1272
1273static bool
1274cache_neg_hit_prep(struct namecache *ncp)
1275{
1276	struct negstate *ns;
1277	u_char n;
1278
1279	ns = NCP2NEGSTATE(ncp);
1280	n = atomic_load_char(&ns->neg_hit);
1281	for (;;) {
1282		if (n >= CACHE_NEG_PROMOTION_THRESH)
1283			return (false);
1284		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1285			break;
1286	}
1287	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1288}
1289
1290/*
1291 * Nothing to do here but it is provided for completeness as some
1292 * cache_neg_hit_prep callers may end up returning without even
1293 * trying to promote.
1294 */
1295#define cache_neg_hit_abort(ncp)	do { } while (0)
1296
1297static void
1298cache_neg_hit_finish(struct namecache *ncp)
1299{
1300
1301	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1302	counter_u64_add(numneghits, 1);
1303}
1304
1305/*
1306 * Move a negative entry to the hot list.
1307 */
1308static void
1309cache_neg_promote_locked(struct namecache *ncp)
1310{
1311	struct neglist *nl;
1312	struct negstate *ns;
1313
1314	ns = NCP2NEGSTATE(ncp);
1315	nl = NCP2NEGLIST(ncp);
1316	mtx_assert(&nl->nl_lock, MA_OWNED);
1317	if ((ns->neg_flag & NEG_HOT) == 0) {
1318		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1319		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1320		nl->nl_hotnum++;
1321		ns->neg_flag |= NEG_HOT;
1322	}
1323}
1324
1325/*
1326 * Move a hot negative entry to the cold list.
1327 */
1328static void
1329cache_neg_demote_locked(struct namecache *ncp)
1330{
1331	struct neglist *nl;
1332	struct negstate *ns;
1333
1334	ns = NCP2NEGSTATE(ncp);
1335	nl = NCP2NEGLIST(ncp);
1336	mtx_assert(&nl->nl_lock, MA_OWNED);
1337	MPASS(ns->neg_flag & NEG_HOT);
1338	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1339	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1340	nl->nl_hotnum--;
1341	ns->neg_flag &= ~NEG_HOT;
1342	atomic_store_char(&ns->neg_hit, 0);
1343}
1344
1345/*
1346 * Move a negative entry to the hot list if it matches the lookup.
1347 *
1348 * We have to take locks, but they may be contended and in the worst
1349 * case we may need to go off CPU. We don't want to spin within the
1350 * smr section and we can't block with it. Exiting the section means
1351 * the found entry could have been evicted. We are going to look it
1352 * up again.
1353 */
1354static bool
1355cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1356    struct namecache *oncp, uint32_t hash)
1357{
1358	struct namecache *ncp;
1359	struct neglist *nl;
1360	u_char nc_flag;
1361
1362	nl = NCP2NEGLIST(oncp);
1363
1364	mtx_lock(&nl->nl_lock);
1365	/*
1366	 * For hash iteration.
1367	 */
1368	vfs_smr_enter();
1369
1370	/*
1371	 * Avoid all surprises by only succeeding if we got the same entry and
1372	 * bailing completely otherwise.
1373	 * XXX There are no provisions to keep the vnode around, meaning we may
1374	 * end up promoting a negative entry for a *new* vnode and returning
1375	 * ENOENT on its account. This is the error we want to return anyway
1376	 * and promotion is harmless.
1377	 *
1378	 * In particular at this point there can be a new ncp which matches the
1379	 * search but hashes to a different neglist.
1380	 */
1381	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1382		if (ncp == oncp)
1383			break;
1384	}
1385
1386	/*
1387	 * No match to begin with.
1388	 */
1389	if (__predict_false(ncp == NULL)) {
1390		goto out_abort;
1391	}
1392
1393	/*
1394	 * The newly found entry may be something different...
1395	 */
1396	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1397	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1398		goto out_abort;
1399	}
1400
1401	/*
1402	 * ... and not even negative.
1403	 */
1404	nc_flag = atomic_load_char(&ncp->nc_flag);
1405	if ((nc_flag & NCF_NEGATIVE) == 0) {
1406		goto out_abort;
1407	}
1408
1409	if (!cache_ncp_canuse(ncp)) {
1410		goto out_abort;
1411	}
1412
1413	cache_neg_promote_locked(ncp);
1414	cache_neg_hit_finish(ncp);
1415	vfs_smr_exit();
1416	mtx_unlock(&nl->nl_lock);
1417	return (true);
1418out_abort:
1419	vfs_smr_exit();
1420	mtx_unlock(&nl->nl_lock);
1421	return (false);
1422}
1423
1424static void
1425cache_neg_promote(struct namecache *ncp)
1426{
1427	struct neglist *nl;
1428
1429	nl = NCP2NEGLIST(ncp);
1430	mtx_lock(&nl->nl_lock);
1431	cache_neg_promote_locked(ncp);
1432	mtx_unlock(&nl->nl_lock);
1433}
1434
1435static void
1436cache_neg_insert(struct namecache *ncp)
1437{
1438	struct neglist *nl;
1439
1440	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1441	cache_assert_bucket_locked(ncp);
1442	nl = NCP2NEGLIST(ncp);
1443	mtx_lock(&nl->nl_lock);
1444	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1445	mtx_unlock(&nl->nl_lock);
1446	atomic_add_long(&numneg, 1);
1447}
1448
1449static void
1450cache_neg_remove(struct namecache *ncp)
1451{
1452	struct neglist *nl;
1453	struct negstate *ns;
1454
1455	cache_assert_bucket_locked(ncp);
1456	nl = NCP2NEGLIST(ncp);
1457	ns = NCP2NEGSTATE(ncp);
1458	mtx_lock(&nl->nl_lock);
1459	if ((ns->neg_flag & NEG_HOT) != 0) {
1460		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1461		nl->nl_hotnum--;
1462	} else {
1463		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1464	}
1465	mtx_unlock(&nl->nl_lock);
1466	atomic_subtract_long(&numneg, 1);
1467}
1468
1469static struct neglist *
1470cache_neg_evict_select_list(void)
1471{
1472	struct neglist *nl;
1473	u_int c;
1474
1475	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1476	nl = &neglists[c % numneglists];
1477	if (!mtx_trylock(&nl->nl_evict_lock)) {
1478		counter_u64_add(neg_evict_skipped_contended, 1);
1479		return (NULL);
1480	}
1481	return (nl);
1482}
1483
1484static struct namecache *
1485cache_neg_evict_select_entry(struct neglist *nl)
1486{
1487	struct namecache *ncp, *lncp;
1488	struct negstate *ns, *lns;
1489	int i;
1490
1491	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1492	mtx_assert(&nl->nl_lock, MA_OWNED);
1493	ncp = TAILQ_FIRST(&nl->nl_list);
1494	if (ncp == NULL)
1495		return (NULL);
1496	lncp = ncp;
1497	lns = NCP2NEGSTATE(lncp);
1498	for (i = 1; i < 4; i++) {
1499		ncp = TAILQ_NEXT(ncp, nc_dst);
1500		if (ncp == NULL)
1501			break;
1502		ns = NCP2NEGSTATE(ncp);
1503		if (ns->neg_hit < lns->neg_hit) {
1504			lncp = ncp;
1505			lns = ns;
1506		}
1507	}
1508	return (lncp);
1509}
1510
1511static bool
1512cache_neg_evict(void)
1513{
1514	struct namecache *ncp, *ncp2;
1515	struct neglist *nl;
1516	struct vnode *dvp;
1517	struct mtx *dvlp;
1518	struct mtx *blp;
1519	uint32_t hash;
1520	u_char nlen;
1521	bool evicted;
1522
1523	nl = cache_neg_evict_select_list();
1524	if (nl == NULL) {
1525		return (false);
1526	}
1527
1528	mtx_lock(&nl->nl_lock);
1529	ncp = TAILQ_FIRST(&nl->nl_hotlist);
1530	if (ncp != NULL) {
1531		cache_neg_demote_locked(ncp);
1532	}
1533	ncp = cache_neg_evict_select_entry(nl);
1534	if (ncp == NULL) {
1535		counter_u64_add(neg_evict_skipped_empty, 1);
1536		mtx_unlock(&nl->nl_lock);
1537		mtx_unlock(&nl->nl_evict_lock);
1538		return (false);
1539	}
1540	nlen = ncp->nc_nlen;
1541	dvp = ncp->nc_dvp;
1542	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1543	dvlp = VP2VNODELOCK(dvp);
1544	blp = HASH2BUCKETLOCK(hash);
1545	mtx_unlock(&nl->nl_lock);
1546	mtx_unlock(&nl->nl_evict_lock);
1547	mtx_lock(dvlp);
1548	mtx_lock(blp);
1549	/*
1550	 * Note that since all locks were dropped above, the entry may be
1551	 * gone or reallocated to be something else.
1552	 */
1553	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1554		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1555		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1556			break;
1557	}
1558	if (ncp2 == NULL) {
1559		counter_u64_add(neg_evict_skipped_missed, 1);
1560		ncp = NULL;
1561		evicted = false;
1562	} else {
1563		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1564		MPASS(blp == NCP2BUCKETLOCK(ncp));
1565		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1566		    ncp->nc_name);
1567		cache_zap_locked(ncp);
1568		counter_u64_add(neg_evicted, 1);
1569		evicted = true;
1570	}
1571	mtx_unlock(blp);
1572	mtx_unlock(dvlp);
1573	if (ncp != NULL)
1574		cache_free(ncp);
1575	return (evicted);
1576}
1577
1578/*
1579 * Maybe evict a negative entry to create more room.
1580 *
1581 * The ncnegfactor parameter limits what fraction of the total count
1582 * can comprise of negative entries. However, if the cache is just
1583 * warming up this leads to excessive evictions.  As such, ncnegminpct
1584 * (recomputed to neg_min) dictates whether the above should be
1585 * applied.
1586 *
1587 * Try evicting if the cache is close to full capacity regardless of
1588 * other considerations.
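 *
 * As a worked example with the defaults (ncnegfactor 5, ncnegminpct 3) and an
 * assumed ncsize of 100000: neg_min is 3000, so eviction is only considered
 * once there are at least 3000 negative entries and they also make up at least
 * 1/5 of the total count; should the total count get within 1000 of ncsize,
 * eviction is attempted regardless.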
1589 */
1590static bool
1591cache_neg_evict_cond(u_long lnumcache)
1592{
1593	u_long lnumneg;
1594
1595	if (ncsize - 1000 < lnumcache)
1596		goto out_evict;
1597	lnumneg = atomic_load_long(&numneg);
1598	if (lnumneg < neg_min)
1599		return (false);
1600	if (lnumneg * ncnegfactor < lnumcache)
1601		return (false);
1602out_evict:
1603	return (cache_neg_evict());
1604}
1605
1606/*
1607 * cache_zap_locked():
1608 *
1609 *   Removes a namecache entry from cache, whether it contains an actual
1610 *   pointer to a vnode or if it is just a negative cache entry.
1611 */
1612static void
1613cache_zap_locked(struct namecache *ncp)
1614{
1615	struct nchashhead *ncpp;
1616	struct vnode *dvp, *vp;
1617
1618	dvp = ncp->nc_dvp;
1619	vp = ncp->nc_vp;
1620
1621	if (!(ncp->nc_flag & NCF_NEGATIVE))
1622		cache_assert_vnode_locked(vp);
1623	cache_assert_vnode_locked(dvp);
1624	cache_assert_bucket_locked(ncp);
1625
1626	cache_ncp_invalidate(ncp);
1627
1628	ncpp = NCP2BUCKET(ncp);
1629	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1630	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1631		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1632		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1633		if (ncp == vp->v_cache_dd) {
1634			atomic_store_ptr(&vp->v_cache_dd, NULL);
1635		}
1636	} else {
1637		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1638		cache_neg_remove(ncp);
1639	}
1640	if (ncp->nc_flag & NCF_ISDOTDOT) {
1641		if (ncp == dvp->v_cache_dd) {
1642			atomic_store_ptr(&dvp->v_cache_dd, NULL);
1643		}
1644	} else {
1645		LIST_REMOVE(ncp, nc_src);
1646		if (LIST_EMPTY(&dvp->v_cache_src)) {
1647			ncp->nc_flag |= NCF_DVDROP;
1648		}
1649	}
1650}
1651
1652static void
1653cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1654{
1655	struct mtx *blp;
1656
1657	MPASS(ncp->nc_dvp == vp);
1658	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1659	cache_assert_vnode_locked(vp);
1660
1661	blp = NCP2BUCKETLOCK(ncp);
1662	mtx_lock(blp);
1663	cache_zap_locked(ncp);
1664	mtx_unlock(blp);
1665}
1666
1667static bool
1668cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1669    struct mtx **vlpp)
1670{
1671	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1672	struct mtx *blp;
1673
1674	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1675	cache_assert_vnode_locked(vp);
1676
1677	if (ncp->nc_flag & NCF_NEGATIVE) {
1678		if (*vlpp != NULL) {
1679			mtx_unlock(*vlpp);
1680			*vlpp = NULL;
1681		}
1682		cache_zap_negative_locked_vnode_kl(ncp, vp);
1683		return (true);
1684	}
1685
1686	pvlp = VP2VNODELOCK(vp);
1687	blp = NCP2BUCKETLOCK(ncp);
1688	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1689	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1690
1691	if (*vlpp == vlp1 || *vlpp == vlp2) {
1692		to_unlock = *vlpp;
1693		*vlpp = NULL;
1694	} else {
1695		if (*vlpp != NULL) {
1696			mtx_unlock(*vlpp);
1697			*vlpp = NULL;
1698		}
1699		cache_sort_vnodes(&vlp1, &vlp2);
1700		if (vlp1 == pvlp) {
1701			mtx_lock(vlp2);
1702			to_unlock = vlp2;
1703		} else {
1704			if (!mtx_trylock(vlp1))
1705				goto out_relock;
1706			to_unlock = vlp1;
1707		}
1708	}
1709	mtx_lock(blp);
1710	cache_zap_locked(ncp);
1711	mtx_unlock(blp);
1712	if (to_unlock != NULL)
1713		mtx_unlock(to_unlock);
1714	return (true);
1715
1716out_relock:
1717	mtx_unlock(vlp2);
1718	mtx_lock(vlp1);
1719	mtx_lock(vlp2);
1720	MPASS(*vlpp == NULL);
1721	*vlpp = vlp1;
1722	return (false);
1723}
1724
1725/*
1726 * We can get here if trylocking failed. We know enough to take all needed
1727 * locks in the right order and re-lookup the entry.
1728 */
1729static int
1730cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1731    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1732    struct mtx *blp)
1733{
1734	struct namecache *rncp;
1735	struct mtx *rvlp;
1736
1737	cache_assert_bucket_unlocked(ncp);
1738
1739	cache_sort_vnodes(&dvlp, &vlp);
1740	cache_lock_vnodes(dvlp, vlp);
1741	mtx_lock(blp);
1742	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1743		if (rncp == ncp && rncp->nc_dvp == dvp &&
1744		    rncp->nc_nlen == cnp->cn_namelen &&
1745		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1746			break;
1747	}
1748
1749	if (rncp == NULL)
1750		goto out_mismatch;
1751
1752	if (!(ncp->nc_flag & NCF_NEGATIVE))
1753		rvlp = VP2VNODELOCK(rncp->nc_vp);
1754	else
1755		rvlp = NULL;
1756	if (rvlp != vlp)
1757		goto out_mismatch;
1758
1759	cache_zap_locked(rncp);
1760	mtx_unlock(blp);
1761	cache_unlock_vnodes(dvlp, vlp);
1762	atomic_add_long(&zap_bucket_relock_success, 1);
1763	return (0);
1764
1765out_mismatch:
1766	mtx_unlock(blp);
1767	cache_unlock_vnodes(dvlp, vlp);
1768	return (EAGAIN);
1769}
1770
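/*
 * Zap an entry found with the bucketlock held, trylocking the vnodelocks. If
 * trylocking fails, the bucketlock is dropped and cache_zap_unlocked_bucket()
 * takes over. Returns 0 on success and EAGAIN if the caller has to re-lookup
 * the entry.
 */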
1771static int __noinline
1772cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1773    uint32_t hash, struct mtx *blp)
1774{
1775	struct mtx *dvlp, *vlp;
1776	struct vnode *dvp;
1777
1778	cache_assert_bucket_locked(ncp);
1779
1780	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1781	vlp = NULL;
1782	if (!(ncp->nc_flag & NCF_NEGATIVE))
1783		vlp = VP2VNODELOCK(ncp->nc_vp);
1784	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1785		cache_zap_locked(ncp);
1786		mtx_unlock(blp);
1787		cache_unlock_vnodes(dvlp, vlp);
1788		return (0);
1789	}
1790
1791	dvp = ncp->nc_dvp;
1792	mtx_unlock(blp);
1793	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1794}
1795
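/*
 * Remove the entry (if any) matching the given name in the directory dvp.
 *
 * Returns 1 if an entry was found and removed, 0 otherwise.
 */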
1796static __noinline int
1797cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1798{
1799	struct namecache *ncp;
1800	struct mtx *blp;
1801	struct mtx *dvlp, *dvlp2;
1802	uint32_t hash;
1803	int error;
1804
1805	if (cnp->cn_namelen == 2 &&
1806	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1807		dvlp = VP2VNODELOCK(dvp);
1808		dvlp2 = NULL;
1809		mtx_lock(dvlp);
1810retry_dotdot:
1811		ncp = dvp->v_cache_dd;
1812		if (ncp == NULL) {
1813			mtx_unlock(dvlp);
1814			if (dvlp2 != NULL)
1815				mtx_unlock(dvlp2);
1816			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1817			return (0);
1818		}
1819		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1820			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1821				goto retry_dotdot;
1822			MPASS(dvp->v_cache_dd == NULL);
1823			mtx_unlock(dvlp);
1824			if (dvlp2 != NULL)
1825				mtx_unlock(dvlp2);
1826			cache_free(ncp);
1827		} else {
1828			atomic_store_ptr(&dvp->v_cache_dd, NULL);
1829			mtx_unlock(dvlp);
1830			if (dvlp2 != NULL)
1831				mtx_unlock(dvlp2);
1832		}
1833		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1834		return (1);
1835	}
1836
1837	/*
1838	 * XXX note that access here is completely unlocked with no provisions
1839	 * to keep the hash allocated. If one is sufficiently unlucky a
1840	 * parallel cache resize can reallocate the hash, unmap backing pages
1841	 * and cause the empty check below to fault.
1842	 *
1843	 * Fixing this has epsilon priority, but can be done with no overhead
1844	 * for this codepath with sufficient effort.
1845	 */
1846	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1847	blp = HASH2BUCKETLOCK(hash);
1848retry:
1849	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1850		goto out_no_entry;
1851
1852	mtx_lock(blp);
1853
1854	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1855		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1856		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1857			break;
1858	}
1859
1860	if (ncp == NULL) {
1861		mtx_unlock(blp);
1862		goto out_no_entry;
1863	}
1864
1865	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1866	if (__predict_false(error != 0)) {
1867		atomic_add_long(&zap_bucket_fail, 1);
1868		goto retry;
1869	}
1870	counter_u64_add(numposzaps, 1);
1871	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1872	cache_free(ncp);
1873	return (1);
1874out_no_entry:
1875	counter_u64_add(nummisszap, 1);
1876	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1877	return (0);
1878}
1879
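/*
 * Handle a "." lookup: return the directory itself with an extra reference,
 * adjusting the lock type if the caller asked for a different one.
 */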
1880static int __noinline
1881cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1882    struct timespec *tsp, int *ticksp)
1883{
1884	int ltype;
1885
1886	*vpp = dvp;
1887	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1888	if (tsp != NULL)
1889		timespecclear(tsp);
1890	if (ticksp != NULL)
1891		*ticksp = ticks;
1892	vrefact(*vpp);
1893	/*
1894	 * When we lookup "." we can still be asked to lock it
1895	 * differently...
1896	 */
1897	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1898	if (ltype != VOP_ISLOCKED(*vpp)) {
1899		if (ltype == LK_EXCLUSIVE) {
1900			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1901			if (VN_IS_DOOMED((*vpp))) {
1902				/* forced unmount */
1903				vrele(*vpp);
1904				*vpp = NULL;
1905				return (ENOENT);
1906			}
1907		} else
1908			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1909	}
1910	return (-1);
1911}
1912
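/*
 * Handle a ".." lookup based on the entry hanging off dvp->v_cache_dd.
 */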
1913static int __noinline
1914cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1915    struct timespec *tsp, int *ticksp)
1916{
1917	struct namecache_ts *ncp_ts;
1918	struct namecache *ncp;
1919	struct mtx *dvlp;
1920	enum vgetstate vs;
1921	int error, ltype;
1922	bool whiteout;
1923
1924	MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1925
1926	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1927		cache_remove_cnp(dvp, cnp);
1928		return (0);
1929	}
1930
1931retry:
1932	dvlp = VP2VNODELOCK(dvp);
1933	mtx_lock(dvlp);
1934	ncp = dvp->v_cache_dd;
1935	if (ncp == NULL) {
1936		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1937		mtx_unlock(dvlp);
1938		return (0);
1939	}
1940	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1941		if (ncp->nc_flag & NCF_NEGATIVE)
1942			*vpp = NULL;
1943		else
1944			*vpp = ncp->nc_vp;
1945	} else
1946		*vpp = ncp->nc_dvp;
1947	if (*vpp == NULL)
1948		goto negative_success;
1949	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1950	cache_out_ts(ncp, tsp, ticksp);
1951	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1952	    NCF_DTS && tsp != NULL) {
1953		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1954		*tsp = ncp_ts->nc_dotdottime;
1955	}
1956
1957	MPASS(dvp != *vpp);
1958	ltype = VOP_ISLOCKED(dvp);
1959	VOP_UNLOCK(dvp);
1960	vs = vget_prep(*vpp);
1961	mtx_unlock(dvlp);
1962	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1963	vn_lock(dvp, ltype | LK_RETRY);
1964	if (VN_IS_DOOMED(dvp)) {
1965		if (error == 0)
1966			vput(*vpp);
1967		*vpp = NULL;
1968		return (ENOENT);
1969	}
1970	if (error) {
1971		*vpp = NULL;
1972		goto retry;
1973	}
1974	return (-1);
1975negative_success:
1976	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1977		if (cnp->cn_flags & ISLASTCN) {
1978			counter_u64_add(numnegzaps, 1);
1979			cache_zap_negative_locked_vnode_kl(ncp, dvp);
1980			mtx_unlock(dvlp);
1981			cache_free(ncp);
1982			return (0);
1983		}
1984	}
1985
1986	whiteout = (ncp->nc_flag & NCF_WHITE);
1987	cache_out_ts(ncp, tsp, ticksp);
1988	if (cache_neg_hit_prep(ncp))
1989		cache_neg_promote(ncp);
1990	else
1991		cache_neg_hit_finish(ncp);
1992	mtx_unlock(dvlp);
1993	if (whiteout)
1994		cnp->cn_flags |= ISWHITEOUT;
1995	return (ENOENT);
1996}
1997
1998/**
1999 * Lookup a name in the name cache
2000 *
2001 * # Arguments
2002 *
2003 * - dvp:	Parent directory in which to search.
2004 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
2005 * - cnp:	Parameters of the name search.  The most interesting bits of
2006 *   		the cn_flags field have the following meanings:
2007 *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
2008 *   			it up.
2009 *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
2010 * - tsp:	Return storage for cache timestamp.  On a successful (positive
2011 *   		or negative) lookup, tsp will be filled with any timespec that
2012 *   		was stored when this cache entry was created.  However, it will
2013 *   		be cleared for "." entries.
2014 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
2015 *   		(positive or negative) lookup, it will contain the ticks value
2016 *   		that was current when the cache entry was created, unless cnp
2017 *   		was ".".
2018 *
2019 * Either both tsp and ticksp have to be provided or neither of them.
2020 *
2021 * # Returns
2022 *
2023 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
2024 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
2025 *		to a forced unmount.  vpp will not be modified.  If the entry
2026 *		is a whiteout, then the ISWHITEOUT flag will be set in
2027 *		cnp->cn_flags.
2028 * - 0:		A cache miss.  vpp will not be modified.
2029 *
2030 * # Locking
2031 *
2032 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
2033 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
2034 * lock is not recursively acquired.
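 *
 * # Example
 *
 * A minimal sketch of how a caller typically consumes the return values
 * described above; vfs_cache_lookup() later in this file is the canonical
 * consumer:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)
 *		return (0);
 *	return (error);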
2035 */
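/*
 * Locked fallback for cache_lookup(), used when the lockless (SMR) fast path
 * gives up.
 */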
2036static int __noinline
2037cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2038    struct timespec *tsp, int *ticksp)
2039{
2040	struct namecache *ncp;
2041	struct mtx *blp;
2042	uint32_t hash;
2043	enum vgetstate vs;
2044	int error;
2045	bool whiteout;
2046
2047	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2048	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2049
2050retry:
2051	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2052	blp = HASH2BUCKETLOCK(hash);
2053	mtx_lock(blp);
2054
2055	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2056		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2057		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2058			break;
2059	}
2060
2061	if (__predict_false(ncp == NULL)) {
2062		mtx_unlock(blp);
2063		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2064		counter_u64_add(nummiss, 1);
2065		return (0);
2066	}
2067
2068	if (ncp->nc_flag & NCF_NEGATIVE)
2069		goto negative_success;
2070
2071	counter_u64_add(numposhits, 1);
2072	*vpp = ncp->nc_vp;
2073	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2074	cache_out_ts(ncp, tsp, ticksp);
2075	MPASS(dvp != *vpp);
2076	vs = vget_prep(*vpp);
2077	mtx_unlock(blp);
2078	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2079	if (error) {
2080		*vpp = NULL;
2081		goto retry;
2082	}
2083	return (-1);
2084negative_success:
2085	/*
2086	 * We don't get here with regular lookup apart from corner cases.
2087	 */
2088	if (__predict_true(cnp->cn_nameiop == CREATE)) {
2089		if (cnp->cn_flags & ISLASTCN) {
2090			counter_u64_add(numnegzaps, 1);
2091			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2092			if (__predict_false(error != 0)) {
2093				atomic_add_long(&zap_bucket_fail2, 1);
2094				goto retry;
2095			}
2096			cache_free(ncp);
2097			return (0);
2098		}
2099	}
2100
2101	whiteout = (ncp->nc_flag & NCF_WHITE);
2102	cache_out_ts(ncp, tsp, ticksp);
2103	if (cache_neg_hit_prep(ncp))
2104		cache_neg_promote(ncp);
2105	else
2106		cache_neg_hit_finish(ncp);
2107	mtx_unlock(blp);
2108	if (whiteout)
2109		cnp->cn_flags |= ISWHITEOUT;
2110	return (ENOENT);
2111}
2112
2113int
2114cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2115    struct timespec *tsp, int *ticksp)
2116{
2117	struct namecache *ncp;
2118	uint32_t hash;
2119	enum vgetstate vs;
2120	int error;
2121	bool whiteout, neg_promote;
2122	u_short nc_flag;
2123
2124	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2125
2126#ifdef DEBUG_CACHE
2127	if (__predict_false(!doingcache)) {
2128		cnp->cn_flags &= ~MAKEENTRY;
2129		return (0);
2130	}
2131#endif
2132
2133	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2134		if (cnp->cn_namelen == 1)
2135			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2136		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2137			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2138	}
2139
2140	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2141
2142	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2143		cache_remove_cnp(dvp, cnp);
2144		return (0);
2145	}
2146
2147	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2148	vfs_smr_enter();
2149
2150	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2151		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2152		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2153			break;
2154	}
2155
2156	if (__predict_false(ncp == NULL)) {
2157		vfs_smr_exit();
2158		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2159		counter_u64_add(nummiss, 1);
2160		return (0);
2161	}
2162
2163	nc_flag = atomic_load_char(&ncp->nc_flag);
2164	if (nc_flag & NCF_NEGATIVE)
2165		goto negative_success;
2166
2167	counter_u64_add(numposhits, 1);
2168	*vpp = ncp->nc_vp;
2169	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2170	cache_out_ts(ncp, tsp, ticksp);
2171	MPASS(dvp != *vpp);
2172	if (!cache_ncp_canuse(ncp)) {
2173		vfs_smr_exit();
2174		*vpp = NULL;
2175		goto out_fallback;
2176	}
2177	vs = vget_prep_smr(*vpp);
2178	vfs_smr_exit();
2179	if (__predict_false(vs == VGET_NONE)) {
2180		*vpp = NULL;
2181		goto out_fallback;
2182	}
2183	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2184	if (error) {
2185		*vpp = NULL;
2186		goto out_fallback;
2187	}
2188	return (-1);
2189negative_success:
2190	if (cnp->cn_nameiop == CREATE) {
2191		if (cnp->cn_flags & ISLASTCN) {
2192			vfs_smr_exit();
2193			goto out_fallback;
2194		}
2195	}
2196
2197	cache_out_ts(ncp, tsp, ticksp);
2198	whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2199	neg_promote = cache_neg_hit_prep(ncp);
2200	if (!cache_ncp_canuse(ncp)) {
2201		cache_neg_hit_abort(ncp);
2202		vfs_smr_exit();
2203		goto out_fallback;
2204	}
2205	if (neg_promote) {
2206		vfs_smr_exit();
2207		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2208			goto out_fallback;
2209	} else {
2210		cache_neg_hit_finish(ncp);
2211		vfs_smr_exit();
2212	}
2213	if (whiteout)
2214		cnp->cn_flags |= ISWHITEOUT;
2215	return (ENOENT);
2216out_fallback:
2217	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2218}
2219
2220struct celockstate {
2221	struct mtx *vlp[3];
2222	struct mtx *blp[2];
2223};
2224CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2225CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2226
2227static inline void
2228cache_celockstate_init(struct celockstate *cel)
2229{
2230
2231	bzero(cel, sizeof(*cel));
2232}
2233
2234static void
2235cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2236    struct vnode *dvp)
2237{
2238	struct mtx *vlp1, *vlp2;
2239
2240	MPASS(cel->vlp[0] == NULL);
2241	MPASS(cel->vlp[1] == NULL);
2242	MPASS(cel->vlp[2] == NULL);
2243
2244	MPASS(vp != NULL || dvp != NULL);
2245
2246	vlp1 = VP2VNODELOCK(vp);
2247	vlp2 = VP2VNODELOCK(dvp);
2248	cache_sort_vnodes(&vlp1, &vlp2);
2249
2250	if (vlp1 != NULL) {
2251		mtx_lock(vlp1);
2252		cel->vlp[0] = vlp1;
2253	}
2254	mtx_lock(vlp2);
2255	cel->vlp[1] = vlp2;
2256}
2257
2258static void
2259cache_unlock_vnodes_cel(struct celockstate *cel)
2260{
2261
2262	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2263
2264	if (cel->vlp[0] != NULL)
2265		mtx_unlock(cel->vlp[0]);
2266	if (cel->vlp[1] != NULL)
2267		mtx_unlock(cel->vlp[1]);
2268	if (cel->vlp[2] != NULL)
2269		mtx_unlock(cel->vlp[2]);
2270}
2271
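/*
 * Lock the vnodelock for a third vnode on top of the two already held in cel.
 *
 * Returns false if lock ordering forced dropping and reacquiring the held
 * locks, in which case the caller must revalidate its state.
 */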
2272static bool
2273cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2274{
2275	struct mtx *vlp;
2276	bool ret;
2277
2278	cache_assert_vlp_locked(cel->vlp[0]);
2279	cache_assert_vlp_locked(cel->vlp[1]);
2280	MPASS(cel->vlp[2] == NULL);
2281
2282	MPASS(vp != NULL);
2283	vlp = VP2VNODELOCK(vp);
2284
2285	ret = true;
2286	if (vlp >= cel->vlp[1]) {
2287		mtx_lock(vlp);
2288	} else {
2289		if (mtx_trylock(vlp))
2290			goto out;
2291		cache_unlock_vnodes_cel(cel);
2292		atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2293		if (vlp < cel->vlp[0]) {
2294			mtx_lock(vlp);
2295			mtx_lock(cel->vlp[0]);
2296			mtx_lock(cel->vlp[1]);
2297		} else {
2298			if (cel->vlp[0] != NULL)
2299				mtx_lock(cel->vlp[0]);
2300			mtx_lock(vlp);
2301			mtx_lock(cel->vlp[1]);
2302		}
2303		ret = false;
2304	}
2305out:
2306	cel->vlp[2] = vlp;
2307	return (ret);
2308}
2309
2310static void
2311cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2312    struct mtx *blp2)
2313{
2314
2315	MPASS(cel->blp[0] == NULL);
2316	MPASS(cel->blp[1] == NULL);
2317
2318	cache_sort_vnodes(&blp1, &blp2);
2319
2320	if (blp1 != NULL) {
2321		mtx_lock(blp1);
2322		cel->blp[0] = blp1;
2323	}
2324	mtx_lock(blp2);
2325	cel->blp[1] = blp2;
2326}
2327
2328static void
2329cache_unlock_buckets_cel(struct celockstate *cel)
2330{
2331
2332	if (cel->blp[0] != NULL)
2333		mtx_unlock(cel->blp[0]);
2334	mtx_unlock(cel->blp[1]);
2335}
2336
2337/*
2338 * Lock part of the cache affected by the insertion.
2339 *
2340 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2341 * However, insertion can result in removal of an old entry. In this
2342 * case we have an additional vnode and bucketlock pair to lock.
2343 *
2344 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2345 * preserving the locking order (smaller address first).
2346 */
2347static void
2348cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2349    uint32_t hash)
2350{
2351	struct namecache *ncp;
2352	struct mtx *blps[2];
2353	u_char nc_flag;
2354
2355	blps[0] = HASH2BUCKETLOCK(hash);
2356	for (;;) {
2357		blps[1] = NULL;
2358		cache_lock_vnodes_cel(cel, dvp, vp);
2359		if (vp == NULL || vp->v_type != VDIR)
2360			break;
2361		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2362		if (ncp == NULL)
2363			break;
2364		nc_flag = atomic_load_char(&ncp->nc_flag);
2365		if ((nc_flag & NCF_ISDOTDOT) == 0)
2366			break;
2367		MPASS(ncp->nc_dvp == vp);
2368		blps[1] = NCP2BUCKETLOCK(ncp);
2369		if ((nc_flag & NCF_NEGATIVE) != 0)
2370			break;
2371		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2372			break;
2373		/*
2374		 * All vnodes got re-locked. Re-validate the state and if
2375		 * nothing changed we are done. Otherwise restart.
2376		 */
2377		if (ncp == vp->v_cache_dd &&
2378		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2379		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2380		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2381			break;
2382		cache_unlock_vnodes_cel(cel);
2383		cel->vlp[0] = NULL;
2384		cel->vlp[1] = NULL;
2385		cel->vlp[2] = NULL;
2386	}
2387	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2388}
2389
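/*
 * The same as cache_enter_lock(), but for entering a ".." entry, where the
 * potentially conflicting entry hangs off dvp->v_cache_dd instead of
 * vp->v_cache_dd.
 */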
2390static void
2391cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2392    uint32_t hash)
2393{
2394	struct namecache *ncp;
2395	struct mtx *blps[2];
2396	u_char nc_flag;
2397
2398	blps[0] = HASH2BUCKETLOCK(hash);
2399	for (;;) {
2400		blps[1] = NULL;
2401		cache_lock_vnodes_cel(cel, dvp, vp);
2402		ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2403		if (ncp == NULL)
2404			break;
2405		nc_flag = atomic_load_char(&ncp->nc_flag);
2406		if ((nc_flag & NCF_ISDOTDOT) == 0)
2407			break;
2408		MPASS(ncp->nc_dvp == dvp);
2409		blps[1] = NCP2BUCKETLOCK(ncp);
2410		if ((nc_flag & NCF_NEGATIVE) != 0)
2411			break;
2412		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2413			break;
2414		if (ncp == dvp->v_cache_dd &&
2415		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2416		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2417		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2418			break;
2419		cache_unlock_vnodes_cel(cel);
2420		cel->vlp[0] = NULL;
2421		cel->vlp[1] = NULL;
2422		cel->vlp[2] = NULL;
2423	}
2424	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2425}
2426
2427static void
2428cache_enter_unlock(struct celockstate *cel)
2429{
2430
2431	cache_unlock_buckets_cel(cel);
2432	cache_unlock_vnodes_cel(cel);
2433}
2434
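/*
 * Prepare for entering a ".." entry: zap the previous one hanging off
 * dvp->v_cache_dd (if any) and clear the pointer.
 */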
2435static void __noinline
2436cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2437    struct componentname *cnp)
2438{
2439	struct celockstate cel;
2440	struct namecache *ncp;
2441	uint32_t hash;
2442	int len;
2443
2444	if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2445		return;
2446	len = cnp->cn_namelen;
2447	cache_celockstate_init(&cel);
2448	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2449	cache_enter_lock_dd(&cel, dvp, vp, hash);
2450	ncp = dvp->v_cache_dd;
2451	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2452		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2453		cache_zap_locked(ncp);
2454	} else {
2455		ncp = NULL;
2456	}
2457	atomic_store_ptr(&dvp->v_cache_dd, NULL);
2458	cache_enter_unlock(&cel);
2459	if (ncp != NULL)
2460		cache_free(ncp);
2461}
2462
2463/*
2464 * Add an entry to the cache.
2465 */
2466void
2467cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2468    struct timespec *tsp, struct timespec *dtsp)
2469{
2470	struct celockstate cel;
2471	struct namecache *ncp, *n2, *ndd;
2472	struct namecache_ts *ncp_ts;
2473	struct nchashhead *ncpp;
2474	uint32_t hash;
2475	int flag;
2476	int len;
2477
2478	KASSERT(cnp->cn_namelen <= NAME_MAX,
2479	    ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2480	    NAME_MAX));
2481	VNPASS(!VN_IS_DOOMED(dvp), dvp);
2482	VNPASS(dvp->v_type != VNON, dvp);
2483	if (vp != NULL) {
2484		VNPASS(!VN_IS_DOOMED(vp), vp);
2485		VNPASS(vp->v_type != VNON, vp);
2486	}
2487	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2488		KASSERT(dvp == vp,
2489		    ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2490		    dvp, vp));
2491	} else {
2492		KASSERT(dvp != vp,
2493		    ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2494		    cnp->cn_nameptr, dvp));
2495	}
2496
2497#ifdef DEBUG_CACHE
2498	if (__predict_false(!doingcache))
2499		return;
2500#endif
2501
2502	flag = 0;
2503	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2504		if (cnp->cn_namelen == 1)
2505			return;
2506		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2507			cache_enter_dotdot_prep(dvp, vp, cnp);
2508			flag = NCF_ISDOTDOT;
2509		}
2510	}
2511
2512	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2513	if (ncp == NULL)
2514		return;
2515
2516	cache_celockstate_init(&cel);
2517	ndd = NULL;
2518	ncp_ts = NULL;
2519
2520	/*
2521	 * Calculate the hash key and setup as much of the new
2522	 * namecache entry as possible before acquiring the lock.
2523	 */
2524	ncp->nc_flag = flag | NCF_WIP;
2525	ncp->nc_vp = vp;
2526	if (vp == NULL)
2527		cache_neg_init(ncp);
2528	ncp->nc_dvp = dvp;
2529	if (tsp != NULL) {
2530		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2531		ncp_ts->nc_time = *tsp;
2532		ncp_ts->nc_ticks = ticks;
2533		ncp_ts->nc_nc.nc_flag |= NCF_TS;
2534		if (dtsp != NULL) {
2535			ncp_ts->nc_dotdottime = *dtsp;
2536			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2537		}
2538	}
2539	len = ncp->nc_nlen = cnp->cn_namelen;
2540	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2541	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2542	ncp->nc_name[len] = '\0';
2543	cache_enter_lock(&cel, dvp, vp, hash);
2544
2545	/*
2546	 * See if this vnode or negative entry is already in the cache
2547	 * with this name.  This can happen with concurrent lookups of
2548	 * the same path name.
2549	 */
2550	ncpp = NCHHASH(hash);
2551	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2552		if (n2->nc_dvp == dvp &&
2553		    n2->nc_nlen == cnp->cn_namelen &&
2554		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2555			MPASS(cache_ncp_canuse(n2));
2556			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2557				KASSERT(vp == NULL,
2558				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2559				    __func__, NULL, vp, cnp->cn_nameptr));
2560			else
2561				KASSERT(n2->nc_vp == vp,
2562				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2563				    __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2564			/*
2565			 * Entries are supposed to be immutable unless in the
2566			 * process of getting destroyed. Accommodating
2567			 * changing timestamps is possible but not worth it.
2568			 * This should be harmless in terms of correctness, in
2569			 * the worst case resulting in an earlier expiration.
2570			 * Alternatively, the found entry can be replaced
2571			 * altogether.
2572			 */
2573			MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2574#if 0
2575			if (tsp != NULL) {
2576				KASSERT((n2->nc_flag & NCF_TS) != 0,
2577				    ("no NCF_TS"));
2578				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2579				n2_ts->nc_time = ncp_ts->nc_time;
2580				n2_ts->nc_ticks = ncp_ts->nc_ticks;
2581				if (dtsp != NULL) {
2582					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2583					n2_ts->nc_nc.nc_flag |= NCF_DTS;
2584				}
2585			}
2586#endif
2587			SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2588			    vp);
2589			goto out_unlock_free;
2590		}
2591	}
2592
2593	if (flag == NCF_ISDOTDOT) {
2594		/*
2595		 * See if we are trying to add a .. entry, but some other lookup
2596		 * has populated the v_cache_dd pointer already.
2597		 */
2598		if (dvp->v_cache_dd != NULL)
2599			goto out_unlock_free;
2600		KASSERT(vp == NULL || vp->v_type == VDIR,
2601		    ("wrong vnode type %p", vp));
2602		atomic_thread_fence_rel();
2603		atomic_store_ptr(&dvp->v_cache_dd, ncp);
2604	}
2605
2606	if (vp != NULL) {
2607		if (flag != NCF_ISDOTDOT) {
2608			/*
2609			 * For this case, the cache entry maps both the
2610			 * directory name in it and the name ".." for the
2611			 * directory's parent.
2612			 */
2613			if ((ndd = vp->v_cache_dd) != NULL) {
2614				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2615					cache_zap_locked(ndd);
2616				else
2617					ndd = NULL;
2618			}
2619			atomic_thread_fence_rel();
2620			atomic_store_ptr(&vp->v_cache_dd, ncp);
2621		} else if (vp->v_type != VDIR) {
2622			if (vp->v_cache_dd != NULL) {
2623				atomic_store_ptr(&vp->v_cache_dd, NULL);
2624			}
2625		}
2626	}
2627
2628	if (flag != NCF_ISDOTDOT) {
2629		if (LIST_EMPTY(&dvp->v_cache_src)) {
2630			cache_hold_vnode(dvp);
2631		}
2632		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2633	}
2634
2635	/*
2636	 * If the entry is "negative", we place it into the
2637	 * "negative" cache queue, otherwise, we place it into the
2638	 * destination vnode's cache entries queue.
2639	 */
2640	if (vp != NULL) {
2641		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2642		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2643		    vp);
2644	} else {
2645		if (cnp->cn_flags & ISWHITEOUT)
2646			atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2647		cache_neg_insert(ncp);
2648		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2649		    ncp->nc_name);
2650	}
2651
2652	/*
2653	 * Insert the new namecache entry into the appropriate chain
2654	 * within the cache entries table.
2655	 */
2656	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2657
2658	atomic_thread_fence_rel();
2659	/*
2660	 * Mark the entry as fully constructed.
2661	 * It is immutable past this point until its removal.
2662	 */
2663	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2664
2665	cache_enter_unlock(&cel);
2666	if (ndd != NULL)
2667		cache_free(ndd);
2668	return;
2669out_unlock_free:
2670	cache_enter_unlock(&cel);
2671	cache_free(ncp);
2672	return;
2673}
2674
2675/*
2676 * A variant of the above accepting flags.
2677 *
2678 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2679 *
2680 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2681 * happens to match, and it does so in an inefficient manner. It was added
2682 * to accommodate NFS, which runs into a case where the target for a given name
2683 * may change from under it. Note this does nothing to solve the following
2684 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2685 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2686 */
2687void
2688cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2689    struct timespec *tsp, struct timespec *dtsp, int flags)
2690{
2691
2692	MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2693
2694	if (flags & VFS_CACHE_DROPOLD)
2695		cache_remove_cnp(dvp, cnp);
2696	cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2697}
2698
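/*
 * Return the smallest power of 2 greater than the given value.
 */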
2699static u_long
2700cache_roundup_2(u_long val)
2701{
2702	u_long res;
2703
2704	for (res = 1; res <= val; res <<= 1)
2705		continue;
2706
2707	return (res);
2708}
2709
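/*
 * Allocate and initialize a hash table. The bucket count is a power of 2
 * derived from the requested element count; the resulting mask is returned
 * via hashmask.
 */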
2710static struct nchashhead *
2711nchinittbl(u_long elements, u_long *hashmask)
2712{
2713	struct nchashhead *hashtbl;
2714	u_long hashsize, i;
2715
2716	hashsize = cache_roundup_2(elements) / 2;
2717
2718	hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2719	for (i = 0; i < hashsize; i++)
2720		CK_SLIST_INIT(&hashtbl[i]);
2721	*hashmask = hashsize - 1;
2722	return (hashtbl);
2723}
2724
2725static void
2726ncfreetbl(struct nchashhead *hashtbl)
2727{
2728
2729	free(hashtbl, M_VFSCACHE);
2730}
2731
2732/*
2733 * Name cache initialization, from vfs_init() when we are booting
2734 */
2735static void
2736nchinit(void *dummy __unused)
2737{
2738	u_int i;
2739
2740	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2741	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2742	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2743	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2744	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2745	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2746	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2747	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2748
2749	VFS_SMR_ZONE_SET(cache_zone_small);
2750	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2751	VFS_SMR_ZONE_SET(cache_zone_large);
2752	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2753
2754	ncsize = desiredvnodes * ncsizefactor;
2755	cache_recalc_neg_min();
2756	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2757	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2758	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2759		ncbuckethash = 7;
2760	if (ncbuckethash > nchash)
2761		ncbuckethash = nchash;
2762	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2763	    M_WAITOK | M_ZERO);
2764	for (i = 0; i < numbucketlocks; i++)
2765		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2766	ncvnodehash = ncbuckethash;
2767	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2768	    M_WAITOK | M_ZERO);
2769	for (i = 0; i < numvnodelocks; i++)
2770		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2771
2772	for (i = 0; i < numneglists; i++) {
2773		mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2774		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2775		TAILQ_INIT(&neglists[i].nl_list);
2776		TAILQ_INIT(&neglists[i].nl_hotlist);
2777	}
2778}
2779SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2780
2781void
2782cache_vnode_init(struct vnode *vp)
2783{
2784
2785	LIST_INIT(&vp->v_cache_src);
2786	TAILQ_INIT(&vp->v_cache_dst);
2787	vp->v_cache_dd = NULL;
2788	cache_prehash(vp);
2789}
2790
2791/*
2792 * Induce transient cache misses for lockless operation in cache_lookup() by
2793 * using a temporary hash table.
2794 *
2795 * This will force a fs lookup.
2796 *
2797 * Synchronization is done in 2 steps, calling vfs_smr_synchronize each time
2798 * to wait until all CPUs are observed not performing the lookup.
2799 */
2800static void
2801cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2802{
2803
2804	MPASS(temphash < nchash);
2805	/*
2806	 * Change the size. The new size is smaller and can safely be used
2807	 * against the existing table. All lookups which now hash wrong will
2808	 * result in a cache miss, which all callers are supposed to know how
2809	 * to handle.
2810	 */
2811	atomic_store_long(&nchash, temphash);
2812	atomic_thread_fence_rel();
2813	vfs_smr_synchronize();
2814	/*
2815	 * At this point everyone sees the updated hash value, but they still
2816	 * see the old table.
2817	 */
2818	atomic_store_ptr(&nchashtbl, temptbl);
2819	atomic_thread_fence_rel();
2820	vfs_smr_synchronize();
2821	/*
2822	 * At this point everyone sees the updated table pointer and size pair.
2823	 */
2824}
2825
2826/*
2827 * Set the new hash table.
2828 *
2829 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2830 * lockless operation in cache_lookup().
2831 */
2832static void
2833cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2834{
2835
2836	MPASS(nchash < new_hash);
2837	/*
2838	 * Change the pointer first. This won't result in out-of-bounds access
2839	 * since the temporary table is guaranteed to be smaller.
2840	 */
2841	atomic_store_ptr(&nchashtbl, new_tbl);
2842	atomic_thread_fence_rel();
2843	vfs_smr_synchronize();
2844	/*
2845	 * At this point everyone sees the updated pointer value, but they
2846	 * still see the old size.
2847	 */
2848	atomic_store_long(&nchash, new_hash);
2849	atomic_thread_fence_rel();
2850	vfs_smr_synchronize();
2851	/*
2852	 * At this point everyone sees the updated table pointer and size pair.
2853	 */
2854}
2855
2856void
2857cache_changesize(u_long newmaxvnodes)
2858{
2859	struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2860	u_long new_nchash, old_nchash, temphash;
2861	struct namecache *ncp;
2862	uint32_t hash;
2863	u_long newncsize;
2864	u_long i;
2865
2866	newncsize = newmaxvnodes * ncsizefactor;
2867	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2868	if (newmaxvnodes < numbucketlocks)
2869		newmaxvnodes = numbucketlocks;
2870
2871	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2872	/* If same hash table size, nothing to do */
2873	if (nchash == new_nchash) {
2874		ncfreetbl(new_nchashtbl);
2875		return;
2876	}
2877
2878	temptbl = nchinittbl(1, &temphash);
2879
2880	/*
2881	 * Move everything from the old hash table to the new table.
2882	 * None of the namecache entries can be removed from under us because
2883	 * doing so requires taking the bucketlocks, all of which we hold.
2884	 */
2885	cache_lock_all_vnodes();
2886	cache_lock_all_buckets();
2887	old_nchashtbl = nchashtbl;
2888	old_nchash = nchash;
2889	cache_changesize_set_temp(temptbl, temphash);
2890	for (i = 0; i <= old_nchash; i++) {
2891		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2892			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2893			    ncp->nc_dvp);
2894			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2895			CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2896		}
2897	}
2898	ncsize = newncsize;
2899	cache_recalc_neg_min();
2900	cache_changesize_set_new(new_nchashtbl, new_nchash);
2901	cache_unlock_all_buckets();
2902	cache_unlock_all_vnodes();
2903	ncfreetbl(old_nchashtbl);
2904	ncfreetbl(temptbl);
2905}
2906
2907/*
2908 * Remove all entries from and to a particular vnode.
2909 */
2910static void
2911cache_purge_impl(struct vnode *vp)
2912{
2913	struct cache_freebatch batch;
2914	struct namecache *ncp;
2915	struct mtx *vlp, *vlp2;
2916
2917	TAILQ_INIT(&batch);
2918	vlp = VP2VNODELOCK(vp);
2919	vlp2 = NULL;
2920	mtx_lock(vlp);
2921retry:
2922	while (!LIST_EMPTY(&vp->v_cache_src)) {
2923		ncp = LIST_FIRST(&vp->v_cache_src);
2924		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2925			goto retry;
2926		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2927	}
2928	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2929		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2930		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2931			goto retry;
2932		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2933	}
2934	ncp = vp->v_cache_dd;
2935	if (ncp != NULL) {
2936		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2937		   ("lost dotdot link"));
2938		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2939			goto retry;
2940		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2941	}
2942	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2943	mtx_unlock(vlp);
2944	if (vlp2 != NULL)
2945		mtx_unlock(vlp2);
2946	cache_free_batch(&batch);
2947}
2948
2949/*
2950 * Opportunistic check to see if there is anything to do.
2951 */
2952static bool
2953cache_has_entries(struct vnode *vp)
2954{
2955
2956	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2957	    atomic_load_ptr(&vp->v_cache_dd) == NULL)
2958		return (false);
2959	return (true);
2960}
2961
2962void
2963cache_purge(struct vnode *vp)
2964{
2965
2966	SDT_PROBE1(vfs, namecache, purge, done, vp);
2967	if (!cache_has_entries(vp))
2968		return;
2969	cache_purge_impl(vp);
2970}
2971
2972/*
2973 * Only to be used by vgone.
2974 */
2975void
2976cache_purge_vgone(struct vnode *vp)
2977{
2978	struct mtx *vlp;
2979
2980	VNPASS(VN_IS_DOOMED(vp), vp);
2981	if (cache_has_entries(vp)) {
2982		cache_purge_impl(vp);
2983		return;
2984	}
2985
2986	/*
2987	 * Serialize against a potential thread doing cache_purge.
2988	 */
2989	vlp = VP2VNODELOCK(vp);
2990	mtx_wait_unlocked(vlp);
2991	if (cache_has_entries(vp)) {
2992		cache_purge_impl(vp);
2993		return;
2994	}
2995	return;
2996}
2997
2998/*
2999 * Remove all negative entries for a particular directory vnode.
3000 */
3001void
3002cache_purge_negative(struct vnode *vp)
3003{
3004	struct cache_freebatch batch;
3005	struct namecache *ncp, *nnp;
3006	struct mtx *vlp;
3007
3008	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
3009	if (LIST_EMPTY(&vp->v_cache_src))
3010		return;
3011	TAILQ_INIT(&batch);
3012	vlp = VP2VNODELOCK(vp);
3013	mtx_lock(vlp);
3014	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3015		if (!(ncp->nc_flag & NCF_NEGATIVE))
3016			continue;
3017		cache_zap_negative_locked_vnode_kl(ncp, vp);
3018		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3019	}
3020	mtx_unlock(vlp);
3021	cache_free_batch(&batch);
3022}
3023
3024/*
3025 * Entry points for modifying VOP operations.
3026 */
3027void
3028cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3029    struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3030{
3031
3032	ASSERT_VOP_IN_SEQC(fdvp);
3033	ASSERT_VOP_IN_SEQC(fvp);
3034	ASSERT_VOP_IN_SEQC(tdvp);
3035	if (tvp != NULL)
3036		ASSERT_VOP_IN_SEQC(tvp);
3037
3038	cache_purge(fvp);
3039	if (tvp != NULL) {
3040		cache_purge(tvp);
3041		KASSERT(!cache_remove_cnp(tdvp, tcnp),
3042		    ("%s: lingering negative entry", __func__));
3043	} else {
3044		cache_remove_cnp(tdvp, tcnp);
3045	}
3046
3047	/*
3048	 * TODO
3049	 *
3050	 * Historically renaming was always purging all relevant entries,
3051	 * but that's quite wasteful. In particular, it turns out that in many
3052	 * cases the target file is immediately accessed after a rename, inducing
3053	 * a cache miss.
3054	 *
3055	 * Recode this to reduce relocking and reuse the existing entry (if any)
3056	 * instead of just removing it above and allocating a new one here.
3057	 */
3058	cache_enter(tdvp, fvp, tcnp);
3059}
3060
3061void
3062cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
3063{
3064
3065	ASSERT_VOP_IN_SEQC(dvp);
3066	ASSERT_VOP_IN_SEQC(vp);
3067	cache_purge(vp);
3068}
3069
3070#ifdef INVARIANTS
3071/*
3072 * Validate that if an entry exists it matches.
3073 */
3074void
3075cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3076{
3077	struct namecache *ncp;
3078	struct mtx *blp;
3079	uint32_t hash;
3080
3081	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3082	if (CK_SLIST_EMPTY(NCHHASH(hash)))
3083		return;
3084	blp = HASH2BUCKETLOCK(hash);
3085	mtx_lock(blp);
3086	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3087		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3088		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
3089			if (ncp->nc_vp != vp)
3090				panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3091				    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3092		}
3093	}
3094	mtx_unlock(blp);
3095}
3096
3097void
3098cache_assert_no_entries(struct vnode *vp)
3099{
3100
3101	VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3102	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3103	VNPASS(vp->v_cache_dd == NULL, vp);
3104}
3105#endif
3106
3107/*
3108 * Flush all entries referencing a particular filesystem.
3109 */
3110void
3111cache_purgevfs(struct mount *mp)
3112{
3113	struct vnode *vp, *mvp;
3114	size_t visited __sdt_used, purged __sdt_used;
3115
3116	visited = purged = 0;
3117	/*
3118	 * Somewhat wasteful iteration over all vnodes. Would be better to
3119	 * support filtering and avoid the interlock to begin with.
3120	 */
3121	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3122		visited++;
3123		if (!cache_has_entries(vp)) {
3124			VI_UNLOCK(vp);
3125			continue;
3126		}
3127		vholdl(vp);
3128		VI_UNLOCK(vp);
3129		cache_purge(vp);
3130		purged++;
3131		vdrop(vp);
3132	}
3133
3134	SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3135}
3136
3137/*
3138 * Perform canonical checks and cache lookup and pass on to the filesystem
3139 * through VOP_CACHEDLOOKUP() only if needed.
3140 */
3141
3142int
3143vfs_cache_lookup(struct vop_lookup_args *ap)
3144{
3145	struct vnode *dvp;
3146	int error;
3147	struct vnode **vpp = ap->a_vpp;
3148	struct componentname *cnp = ap->a_cnp;
3149	int flags = cnp->cn_flags;
3150
3151	*vpp = NULL;
3152	dvp = ap->a_dvp;
3153
3154	if (dvp->v_type != VDIR)
3155		return (ENOTDIR);
3156
3157	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3158	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3159		return (EROFS);
3160
3161	error = vn_dir_check_exec(dvp, cnp);
3162	if (error != 0)
3163		return (error);
3164
3165	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3166	if (error == 0)
3167		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3168	if (error == -1)
3169		return (0);
3170	return (error);
3171}
3172
3173/* Implementation of the getcwd syscall. */
3174int
3175sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3176{
3177	char *buf, *retbuf;
3178	size_t buflen;
3179	int error;
3180
3181	buflen = uap->buflen;
3182	if (__predict_false(buflen < 2))
3183		return (EINVAL);
3184	if (buflen > MAXPATHLEN)
3185		buflen = MAXPATHLEN;
3186
3187	buf = uma_zalloc(namei_zone, M_WAITOK);
3188	error = vn_getcwd(buf, &retbuf, &buflen);
3189	if (error == 0)
3190		error = copyout(retbuf, uap->buf, buflen);
3191	uma_zfree(namei_zone, buf);
3192	return (error);
3193}
3194
3195int
3196vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3197{
3198	struct pwd *pwd;
3199	int error;
3200
3201	vfs_smr_enter();
3202	pwd = pwd_get_smr();
3203	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3204	    buflen, 0);
3205	VFS_SMR_ASSERT_NOT_ENTERED();
3206	if (error < 0) {
3207		pwd = pwd_hold(curthread);
3208		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3209		    retbuf, buflen);
3210		pwd_drop(pwd);
3211	}
3212
3213#ifdef KTRACE
3214	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3215		ktrnamei(*retbuf);
3216#endif
3217	return (error);
3218}
3219
3220/*
3221 * Canonicalize a path by walking it forward and back.
3222 *
3223 * BUGS:
3224 * - Nothing guarantees the integrity of the entire chain. Consider the case
3225 *   where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3226 *   "foo" into "quux" during the backwards walk. The result will be
3227 *   "quux/bar/baz/qux", which could not have been obtained by an incremental
3228 *   walk in userspace. Moreover, the path we return is inaccessible if the
3229 *   calling thread lacks permission to traverse "quux".
3230 */
3231static int
3232kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3233    size_t size, int flags, enum uio_seg pathseg)
3234{
3235	struct nameidata nd;
3236	char *retbuf, *freebuf;
3237	int error;
3238
3239	if (flags != 0)
3240		return (EINVAL);
3241	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3242	    pathseg, path, fd, &cap_fstat_rights);
3243	if ((error = namei(&nd)) != 0)
3244		return (error);
3245
3246	if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3247	    (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3248		/*
3249		 * This happens if vp is a file mount. The call to
3250		 * vn_fullpath_hardlink can panic if path resolution can't be
3251		 * handled without the directory.
3252		 *
3253		 * To resolve this, we find the vnode which was mounted on -
3254		 * this should have a unique global path since we disallow
3255		 * mounting on linked files.
3256		 */
3257		struct vnode *covered_vp;
3258		error = vn_lock(nd.ni_vp, LK_SHARED);
3259		if (error != 0)
3260			goto out;
3261		covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3262		vref(covered_vp);
3263		VOP_UNLOCK(nd.ni_vp);
3264		error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3265		vrele(covered_vp);
3266	} else {
3267		error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3268		    nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3269	}
3270	if (error == 0) {
3271		error = copyout(retbuf, buf, size);
3272		free(freebuf, M_TEMP);
3273	}
3274out:
3275	vrele(nd.ni_vp);
3276	vrele(nd.ni_dvp);
3277	NDFREE_PNBUF(&nd);
3278	return (error);
3279}
3280
3281int
3282sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3283{
3284
3285	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3286	    uap->flags, UIO_USERSPACE));
3287}
3288
3289/*
3290 * Retrieve the full filesystem path that corresponds to a vnode from the name
3291 * cache (if available).
3292 */
3293int
3294vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3295{
3296	struct pwd *pwd;
3297	char *buf;
3298	size_t buflen;
3299	int error;
3300
3301	if (__predict_false(vp == NULL))
3302		return (EINVAL);
3303
3304	buflen = MAXPATHLEN;
3305	buf = malloc(buflen, M_TEMP, M_WAITOK);
3306	vfs_smr_enter();
3307	pwd = pwd_get_smr();
3308	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3309	VFS_SMR_ASSERT_NOT_ENTERED();
3310	if (error < 0) {
3311		pwd = pwd_hold(curthread);
3312		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3313		pwd_drop(pwd);
3314	}
3315	if (error == 0)
3316		*freebuf = buf;
3317	else
3318		free(buf, M_TEMP);
3319	return (error);
3320}
3321
3322/*
3323 * This function is similar to vn_fullpath, but it attempts to lookup the
3324 * pathname relative to the global root mount point.  This is required for the
3325 * auditing sub-system, as audited pathnames must be absolute, relative to the
3326 * global root mount point.
3327 */
3328int
3329vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3330{
3331	char *buf;
3332	size_t buflen;
3333	int error;
3334
3335	if (__predict_false(vp == NULL))
3336		return (EINVAL);
3337	buflen = MAXPATHLEN;
3338	buf = malloc(buflen, M_TEMP, M_WAITOK);
3339	vfs_smr_enter();
3340	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3341	VFS_SMR_ASSERT_NOT_ENTERED();
3342	if (error < 0) {
3343		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3344	}
3345	if (error == 0)
3346		*freebuf = buf;
3347	else
3348		free(buf, M_TEMP);
3349	return (error);
3350}
3351
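/*
 * Find a regular (non-"..") entry naming this vnode, suitable for resolving
 * its name and parent directory.
 */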
3352static struct namecache *
3353vn_dd_from_dst(struct vnode *vp)
3354{
3355	struct namecache *ncp;
3356
3357	cache_assert_vnode_locked(vp);
3358	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3359		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3360			return (ncp);
3361	}
3362	return (NULL);
3363}
3364
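/*
 * Resolve one pathname component: find the name of *vp in its parent
 * directory, either from the namecache or with VOP_VPTOCNP().
 *
 * On success the name is prepended at buf + *buflen (with *buflen adjusted
 * accordingly) and *vp is replaced with a referenced parent vnode; the
 * reference on the passed vnode is dropped.
 */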
3365int
3366vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3367{
3368	struct vnode *dvp;
3369	struct namecache *ncp;
3370	struct mtx *vlp;
3371	int error;
3372
3373	vlp = VP2VNODELOCK(*vp);
3374	mtx_lock(vlp);
3375	ncp = (*vp)->v_cache_dd;
3376	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3377		KASSERT(ncp == vn_dd_from_dst(*vp),
3378		    ("%s: mismatch for dd entry (%p != %p)", __func__,
3379		    ncp, vn_dd_from_dst(*vp)));
3380	} else {
3381		ncp = vn_dd_from_dst(*vp);
3382	}
3383	if (ncp != NULL) {
3384		if (*buflen < ncp->nc_nlen) {
3385			mtx_unlock(vlp);
3386			vrele(*vp);
3387			counter_u64_add(numfullpathfail4, 1);
3388			error = ENOMEM;
3389			SDT_PROBE3(vfs, namecache, fullpath, return, error,
3390			    vp, NULL);
3391			return (error);
3392		}
3393		*buflen -= ncp->nc_nlen;
3394		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3395		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3396		    ncp->nc_name, vp);
3397		dvp = *vp;
3398		*vp = ncp->nc_dvp;
3399		vref(*vp);
3400		mtx_unlock(vlp);
3401		vrele(dvp);
3402		return (0);
3403	}
3404	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3405
3406	mtx_unlock(vlp);
3407	vn_lock(*vp, LK_SHARED | LK_RETRY);
3408	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3409	vput(*vp);
3410	if (error) {
3411		counter_u64_add(numfullpathfail2, 1);
3412		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
3413		return (error);
3414	}
3415
3416	*vp = dvp;
3417	if (VN_IS_DOOMED(dvp)) {
3418		/* forced unmount */
3419		vrele(dvp);
3420		error = ENOENT;
3421		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3422		return (error);
3423	}
3424	/*
3425	 * *vp has its use count incremented still.
3426	 */
3427
3428	return (0);
3429}
3430
3431/*
3432 * Resolve a directory to a pathname.
3433 *
3434 * The name of the directory can always be found in the namecache or fetched
3435 * from the filesystem. There is also guaranteed to be only one parent, meaning
3436 * we can just follow vnodes up until we find the root.
3437 *
3438 * The vnode must be referenced.
3439 */
3440static int
3441vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3442    size_t *len, size_t addend)
3443{
3444#ifdef KDTRACE_HOOKS
3445	struct vnode *startvp = vp;
3446#endif
3447	struct vnode *vp1;
3448	size_t buflen;
3449	int error;
3450	bool slash_prefixed;
3451
3452	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3453	VNPASS(vp->v_usecount > 0, vp);
3454
3455	buflen = *len;
3456
3457	slash_prefixed = true;
3458	if (addend == 0) {
3459		MPASS(*len >= 2);
3460		buflen--;
3461		buf[buflen] = '\0';
3462		slash_prefixed = false;
3463	}
3464
3465	error = 0;
3466
3467	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3468	counter_u64_add(numfullpathcalls, 1);
3469	while (vp != rdir && vp != rootvnode) {
3470		/*
3471		 * The vp vnode must be already fully constructed,
3472		 * since it is either found in namecache or obtained
3473		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3474		 * without obtaining the vnode lock.
3475		 */
3476		if ((vp->v_vflag & VV_ROOT) != 0) {
3477			vn_lock(vp, LK_RETRY | LK_SHARED);
3478
3479			/*
3480			 * With the vnode locked, check for races with
3481			 * unmount, forced or not.  Note that we
3482			 * already verified that vp is not equal to
3483			 * the root vnode, which means that
3484			 * mnt_vnodecovered can be NULL only for the
3485			 * case of unmount.
3486			 */
3487			if (VN_IS_DOOMED(vp) ||
3488			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3489			    vp1->v_mountedhere != vp->v_mount) {
3490				vput(vp);
3491				error = ENOENT;
3492				SDT_PROBE3(vfs, namecache, fullpath, return,
3493				    error, vp, NULL);
3494				break;
3495			}
3496
3497			vref(vp1);
3498			vput(vp);
3499			vp = vp1;
3500			continue;
3501		}
3502		VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3503		error = vn_vptocnp(&vp, buf, &buflen);
3504		if (error)
3505			break;
3506		if (buflen == 0) {
3507			vrele(vp);
3508			error = ENOMEM;
3509			SDT_PROBE3(vfs, namecache, fullpath, return, error,
3510			    startvp, NULL);
3511			break;
3512		}
3513		buf[--buflen] = '/';
3514		slash_prefixed = true;
3515	}
3516	if (error)
3517		return (error);
3518	if (!slash_prefixed) {
3519		if (buflen == 0) {
3520			vrele(vp);
3521			counter_u64_add(numfullpathfail4, 1);
3522			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3523			    startvp, NULL);
3524			return (ENOMEM);
3525		}
3526		buf[--buflen] = '/';
3527	}
3528	counter_u64_add(numfullpathfound, 1);
3529	vrele(vp);
3530
3531	*retbuf = buf + buflen;
3532	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3533	*len -= buflen;
3534	*len += addend;
3535	return (0);
3536}
3537
3538/*
3539 * Resolve an arbitrary vnode to a pathname.
3540 *
3541 * Note 2 caveats:
3542 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3543 *   resolve to a different path than the one used to find it
3544 * - namecache is not mandatory, meaning names are not guaranteed to be added
3545 *   (in which case resolving fails)
3546 */
3547static void __inline
3548cache_rev_failed_impl(int *reason, int line)
3549{
3550
3551	*reason = line;
3552}
3553#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3554
3555static int
3556vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3557    char **retbuf, size_t *buflen, size_t addend)
3558{
3559#ifdef KDTRACE_HOOKS
3560	struct vnode *startvp = vp;
3561#endif
3562	struct vnode *tvp;
3563	struct mount *mp;
3564	struct namecache *ncp;
3565	size_t orig_buflen;
3566	int reason;
3567	int error;
3568#ifdef KDTRACE_HOOKS
3569	int i;
3570#endif
3571	seqc_t vp_seqc, tvp_seqc;
3572	u_char nc_flag;
3573
3574	VFS_SMR_ASSERT_ENTERED();
3575
3576	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3577		vfs_smr_exit();
3578		return (-1);
3579	}
3580
3581	orig_buflen = *buflen;
3582
3583	if (addend == 0) {
3584		MPASS(*buflen >= 2);
3585		*buflen -= 1;
3586		buf[*buflen] = '\0';
3587	}
3588
3589	if (vp == rdir || vp == rootvnode) {
3590		if (addend == 0) {
3591			*buflen -= 1;
3592			buf[*buflen] = '/';
3593		}
3594		goto out_ok;
3595	}
3596
3597#ifdef KDTRACE_HOOKS
3598	i = 0;
3599#endif
3600	error = -1;
3601	ncp = NULL; /* for sdt probe down below */
3602	vp_seqc = vn_seqc_read_any(vp);
3603	if (seqc_in_modify(vp_seqc)) {
3604		cache_rev_failed(&reason);
3605		goto out_abort;
3606	}
3607
3608	for (;;) {
3609#ifdef KDTRACE_HOOKS
3610		i++;
3611#endif
3612		if ((vp->v_vflag & VV_ROOT) != 0) {
3613			mp = atomic_load_ptr(&vp->v_mount);
3614			if (mp == NULL) {
3615				cache_rev_failed(&reason);
3616				goto out_abort;
3617			}
3618			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3619			tvp_seqc = vn_seqc_read_any(tvp);
3620			if (seqc_in_modify(tvp_seqc)) {
3621				cache_rev_failed(&reason);
3622				goto out_abort;
3623			}
3624			if (!vn_seqc_consistent(vp, vp_seqc)) {
3625				cache_rev_failed(&reason);
3626				goto out_abort;
3627			}
3628			vp = tvp;
3629			vp_seqc = tvp_seqc;
3630			continue;
3631		}
3632		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3633		if (ncp == NULL) {
3634			cache_rev_failed(&reason);
3635			goto out_abort;
3636		}
3637		nc_flag = atomic_load_char(&ncp->nc_flag);
3638		if ((nc_flag & NCF_ISDOTDOT) != 0) {
3639			cache_rev_failed(&reason);
3640			goto out_abort;
3641		}
3642		if (ncp->nc_nlen >= *buflen) {
3643			cache_rev_failed(&reason);
3644			error = ENOMEM;
3645			goto out_abort;
3646		}
3647		*buflen -= ncp->nc_nlen;
3648		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3649		*buflen -= 1;
3650		buf[*buflen] = '/';
3651		tvp = ncp->nc_dvp;
3652		tvp_seqc = vn_seqc_read_any(tvp);
3653		if (seqc_in_modify(tvp_seqc)) {
3654			cache_rev_failed(&reason);
3655			goto out_abort;
3656		}
3657		if (!vn_seqc_consistent(vp, vp_seqc)) {
3658			cache_rev_failed(&reason);
3659			goto out_abort;
3660		}
3661		/*
3662		 * Acquire fence provided by vn_seqc_read_any above.
3663		 */
3664		if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3665			cache_rev_failed(&reason);
3666			goto out_abort;
3667		}
3668		if (!cache_ncp_canuse(ncp)) {
3669			cache_rev_failed(&reason);
3670			goto out_abort;
3671		}
3672		vp = tvp;
3673		vp_seqc = tvp_seqc;
3674		if (vp == rdir || vp == rootvnode)
3675			break;
3676	}
3677out_ok:
3678	vfs_smr_exit();
3679	*retbuf = buf + *buflen;
3680	*buflen = orig_buflen - *buflen + addend;
3681	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3682	return (0);
3683
3684out_abort:
3685	*buflen = orig_buflen;
3686	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3687	vfs_smr_exit();
3688	return (error);
3689}
3690
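/*
 * Fallback (locking) counterpart of vn_fullpath_any_smr(): resolve the last
 * component with vn_vptocnp() if the vnode is not a directory, then walk
 * towards the root with vn_fullpath_dir().
 */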
3691static int
3692vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3693    size_t *buflen)
3694{
3695	size_t orig_buflen, addend;
3696	int error;
3697
3698	if (*buflen < 2)
3699		return (EINVAL);
3700
3701	orig_buflen = *buflen;
3702
3703	vref(vp);
3704	addend = 0;
3705	if (vp->v_type != VDIR) {
3706		*buflen -= 1;
3707		buf[*buflen] = '\0';
3708		error = vn_vptocnp(&vp, buf, buflen);
3709		if (error)
3710			return (error);
3711		if (*buflen == 0) {
3712			vrele(vp);
3713			return (ENOMEM);
3714		}
3715		*buflen -= 1;
3716		buf[*buflen] = '/';
3717		addend = orig_buflen - *buflen;
3718	}
3719
3720	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3721}
3722
3723/*
3724 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3725 *
 * Since the namecache does not track hardlinks, the caller is expected to
 * first look up the target vnode with the WANTPARENT flag passed to namei,
 * which yields both dvp and vp.
 *
 * There are then two cases:
3731 * - if the found vnode is a directory, the path can be constructed just by
3732 *   following names up the chain
3733 * - otherwise we populate the buffer with the saved name and start resolving
3734 *   from the parent
3735 */
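/*
 * Hypothetical example: after a namei() call with WANTPARENT for "/tmp/b",
 * where "b" is a hardlink to some file, dvp is the vnode for "/tmp" and vp
 * is the file itself. Since vp is not a directory, the buffer is seeded with
 * "/b" and resolution continues from dvp, so the result is "/tmp/b" even if
 * the namecache only knows the file by another of its names.
 */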
3736int
3737vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3738    const char *hrdl_name, size_t hrdl_name_length,
3739    char **retbuf, char **freebuf, size_t *buflen)
3740{
3741	char *buf, *tmpbuf;
3742	struct pwd *pwd;
3743	size_t addend;
3744	int error;
3745	__enum_uint8(vtype) type;
3746
3747	if (*buflen < 2)
3748		return (EINVAL);
3749	if (*buflen > MAXPATHLEN)
3750		*buflen = MAXPATHLEN;
3751
3752	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3753
3754	addend = 0;
3755
3756	/*
3757	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3758	 *
3759	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3760	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3761	 * If the type is VDIR (like in this very case) we can skip looking
3762	 * at ni_dvp in the first place. However, since vnodes get passed here
	 * unlocked, the target may transition to doomed state (type == VBAD)
3764	 * before we get to evaluate the condition. If this happens, we will
3765	 * populate part of the buffer and descend to vn_fullpath_dir with
3766	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3767	 */
3768	type = atomic_load_8(&vp->v_type);
3769	if (type == VBAD) {
3770		error = ENOENT;
3771		goto out_bad;
3772	}
3773	if (type != VDIR) {
3774		addend = hrdl_name_length + 2;
3775		if (*buflen < addend) {
3776			error = ENOMEM;
3777			goto out_bad;
3778		}
3779		*buflen -= addend;
3780		tmpbuf = buf + *buflen;
3781		tmpbuf[0] = '/';
3782		memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3783		tmpbuf[addend - 1] = '\0';
3784		vp = dvp;
3785	}
3786
3787	vfs_smr_enter();
3788	pwd = pwd_get_smr();
3789	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3790	    addend);
3791	VFS_SMR_ASSERT_NOT_ENTERED();
3792	if (error < 0) {
3793		pwd = pwd_hold(curthread);
3794		vref(vp);
3795		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3796		    addend);
3797		pwd_drop(pwd);
3798	}
3799	if (error != 0)
3800		goto out_bad;
3801
3802	*freebuf = buf;
3803
3804	return (0);
3805out_bad:
3806	free(buf, M_TEMP);
3807	return (error);
3808}
3809
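/*
 * Find the parent of the directory vnode vp by scanning namecache entries
 * which name vp in some directory, skipping ".." entries. On success the
 * parent is returned referenced and share-locked (LK_NOWAIT); NULL is
 * returned if no usable entry exists or the lock cannot be taken without
 * sleeping.
 */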
3810struct vnode *
3811vn_dir_dd_ino(struct vnode *vp)
3812{
3813	struct namecache *ncp;
3814	struct vnode *ddvp;
3815	struct mtx *vlp;
3816	enum vgetstate vs;
3817
3818	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3819	vlp = VP2VNODELOCK(vp);
3820	mtx_lock(vlp);
3821	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3822		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3823			continue;
3824		ddvp = ncp->nc_dvp;
3825		vs = vget_prep(ddvp);
3826		mtx_unlock(vlp);
3827		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3828			return (NULL);
3829		return (ddvp);
3830	}
3831	mtx_unlock(vlp);
3832	return (NULL);
3833}
3834
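/*
 * Retrieve the name the vnode is cached under (skipping ".." entries),
 * copying at most buflen - 1 characters into buf and NUL-terminating it.
 * Returns ENOENT if no entry is present.
 */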
3835int
3836vn_commname(struct vnode *vp, char *buf, u_int buflen)
3837{
3838	struct namecache *ncp;
3839	struct mtx *vlp;
3840	int l;
3841
3842	vlp = VP2VNODELOCK(vp);
3843	mtx_lock(vlp);
3844	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3845		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3846			break;
3847	if (ncp == NULL) {
3848		mtx_unlock(vlp);
3849		return (ENOENT);
3850	}
3851	l = min(ncp->nc_nlen, buflen - 1);
3852	memcpy(buf, ncp->nc_name, l);
3853	mtx_unlock(vlp);
3854	buf[l] = '\0';
3855	return (0);
3856}
3857
3858/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
3861 *
3862 * Requires a locked, referenced vnode.
3863 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3864 *
3865 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3866 * because it falls back to the ".." lookup if the namecache lookup fails.
3867 */
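/*
 * Sketch of a typical use: canonicalize a user-supplied path before storing
 * it, e.g. the way mount(2) handling records the path a filesystem is
 * mounted on -- look the path up with namei(), then call this routine to
 * rewrite the buffer in place with the global path, failing if the vnode
 * was renamed in the meantime.
 */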
3868int
3869vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3870    u_int pathlen)
3871{
3872	struct nameidata nd;
3873	struct vnode *vp1;
3874	char *rpath, *fbuf;
3875	int error;
3876
3877	ASSERT_VOP_ELOCKED(vp, __func__);
3878
3879	/* Construct global filesystem path from vp. */
3880	VOP_UNLOCK(vp);
3881	error = vn_fullpath_global(vp, &rpath, &fbuf);
3882
3883	if (error != 0) {
3884		vrele(vp);
3885		return (error);
3886	}
3887
3888	if (strlen(rpath) >= pathlen) {
3889		vrele(vp);
3890		error = ENAMETOOLONG;
3891		goto out;
3892	}
3893
3894	/*
3895	 * Re-lookup the vnode by path to detect a possible rename.
3896	 * As a side effect, the vnode is relocked.
3897	 * If vnode was renamed, return ENOENT.
3898	 */
3899	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3900	error = namei(&nd);
3901	if (error != 0) {
3902		vrele(vp);
3903		goto out;
3904	}
3905	NDFREE_PNBUF(&nd);
3906	vp1 = nd.ni_vp;
3907	vrele(vp);
3908	if (vp1 == vp)
3909		strcpy(path, rpath);
3910	else {
3911		vput(vp1);
3912		error = ENOENT;
3913	}
3914
3915out:
3916	free(fbuf, M_TEMP);
3917	return (error);
3918}
3919
3920/*
3921 * This is similar to vn_path_to_global_path but allows for regular
3922 * files which may not be present in the cache.
3923 *
3924 * Requires a locked, referenced vnode.
3925 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3926 */
3927int
3928vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3929    struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3930    size_t leaf_length)
3931{
3932	struct nameidata nd;
3933	struct vnode *vp1;
3934	char *rpath, *fbuf;
3935	size_t len;
3936	int error;
3937
3938	ASSERT_VOP_ELOCKED(vp, __func__);
3939
3940	/*
3941	 * Construct global filesystem path from dvp, vp and leaf
3942	 * name.
3943	 */
3944	VOP_UNLOCK(vp);
3945	len = pathlen;
3946	error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3947	    &rpath, &fbuf, &len);
3948
3949	if (error != 0) {
3950		vrele(vp);
3951		return (error);
3952	}
3953
3954	if (strlen(rpath) >= pathlen) {
3955		vrele(vp);
3956		error = ENAMETOOLONG;
3957		goto out;
3958	}
3959
3960	/*
3961	 * Re-lookup the vnode by path to detect a possible rename.
3962	 * As a side effect, the vnode is relocked.
3963	 * If vnode was renamed, return ENOENT.
3964	 */
3965	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3966	error = namei(&nd);
3967	if (error != 0) {
3968		vrele(vp);
3969		goto out;
3970	}
3971	NDFREE_PNBUF(&nd);
3972	vp1 = nd.ni_vp;
3973	vrele(vp);
3974	if (vp1 == vp)
3975		strcpy(path, rpath);
3976	else {
3977		vput(vp1);
3978		error = ENOENT;
3979	}
3980
3981out:
3982	free(fbuf, M_TEMP);
3983	return (error);
3984}
3985
3986#ifdef DDB
3987static void
3988db_print_vpath(struct vnode *vp)
3989{
3990
3991	while (vp != NULL) {
3992		db_printf("%p: ", vp);
3993		if (vp == rootvnode) {
3994			db_printf("/");
3995			vp = NULL;
3996		} else {
3997			if (vp->v_vflag & VV_ROOT) {
3998				db_printf("<mount point>");
3999				vp = vp->v_mount->mnt_vnodecovered;
4000			} else {
4001				struct namecache *ncp;
4002				char *ncn;
4003				int i;
4004
4005				ncp = TAILQ_FIRST(&vp->v_cache_dst);
4006				if (ncp != NULL) {
4007					ncn = ncp->nc_name;
4008					for (i = 0; i < ncp->nc_nlen; i++)
4009						db_printf("%c", *ncn++);
4010					vp = ncp->nc_dvp;
4011				} else {
4012					vp = NULL;
4013				}
4014			}
4015		}
4016		db_printf("\n");
4017	}
4020}
4021
4022DB_SHOW_COMMAND(vpath, db_show_vpath)
4023{
4024	struct vnode *vp;
4025
4026	if (!have_addr) {
4027		db_printf("usage: show vpath <struct vnode *>\n");
4028		return;
4029	}
4030
4031	vp = (struct vnode *)addr;
4032	db_print_vpath(vp);
4033}
4034
4035#endif
4036
4037static int cache_fast_lookup = 1;
4038
4039#define CACHE_FPL_FAILED	-2020
4040
4041static int
4042cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
4043{
4044	vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
4045	panic("no proper vop_fplookup_vexec");
4046}
4047
4048static int
4049cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
4050{
4051	vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
4052	panic("no proper vop_fplookup_symlink");
4053}
4054
4055void
4056cache_vop_vector_register(struct vop_vector *v)
4057{
4058	size_t ops;
4059
4060	ops = 0;
4061	if (v->vop_fplookup_vexec != NULL) {
4062		ops++;
4063	}
4064	if (v->vop_fplookup_symlink != NULL) {
4065		ops++;
4066	}
4067
4068	if (ops == 2) {
4069		return;
4070	}
4071
4072	if (ops == 0) {
4073		v->vop_fplookup_vexec = cache_vop_bad_vexec;
4074		v->vop_fplookup_symlink = cache_vop_bad_symlink;
4075		return;
4076	}
4077
	printf("%s: invalid vop vector %p -- either all or none fplookup vops "
	    "need to be provided\n", __func__, v);
4080	if (v->vop_fplookup_vexec == NULL) {
4081		printf("%s: missing vop_fplookup_vexec\n", __func__);
4082	}
4083	if (v->vop_fplookup_symlink == NULL) {
4084		printf("%s: missing vop_fplookup_symlink\n", __func__);
4085	}
4086	panic("bad vop vector %p", v);
4087}
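
/*
 * A hypothetical filesystem opting into lockless lookup provides both entry
 * points in its vop_vector (and sets MNTK_FPLOOKUP on its mounts), e.g.:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_fplookup_vexec	= foofs_fplookup_vexec,
 *		.vop_fplookup_symlink	= foofs_fplookup_symlink,
 *		...
 *	};
 *
 * Supplying only one of the two is caught in cache_vop_vector_register above.
 */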
4088
4089#ifdef INVARIANTS
4090void
4091cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4092{
4093	if (mp == NULL)
4094		return;
4095
4096	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4097		return;
4098
4099	if (vops->vop_fplookup_vexec == NULL ||
4100	    vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4101		panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4102		    vops, mp->mnt_vfc->vfc_name);
4103
4104	if (vops->vop_fplookup_symlink == NULL ||
4105	    vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4106		panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4107		    vops, mp->mnt_vfc->vfc_name);
4108}
4109#endif
4110
4111void
4112cache_fast_lookup_enabled_recalc(void)
4113{
4114	int lookup_flag;
4115	int mac_on;
4116
4117#ifdef MAC
4118	mac_on = mac_vnode_check_lookup_enabled();
4119	mac_on |= mac_vnode_check_readlink_enabled();
4120#else
4121	mac_on = 0;
4122#endif
4123
4124	lookup_flag = atomic_load_int(&cache_fast_lookup);
4125	if (lookup_flag && !mac_on) {
4126		atomic_store_char(&cache_fast_lookup_enabled, true);
4127	} else {
4128		atomic_store_char(&cache_fast_lookup_enabled, false);
4129	}
4130}
4131
4132static int
sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4134{
4135	int error, old;
4136
4137	old = atomic_load_int(&cache_fast_lookup);
4138	error = sysctl_handle_int(oidp, arg1, arg2, req);
4139	if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4140		cache_fast_lookup_enabled_recalc();
4141	return (error);
4142}
SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");
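
/*
 * Example (sketch): assuming the _vfs_cache_param node corresponds to
 * "vfs.cache.param", lockless lookup can be toggled at runtime with e.g.
 *
 *	sysctl vfs.cache.param.fast_lookup=0
 *
 * The handler then calls cache_fast_lookup_enabled_recalc so that MAC
 * lookup/readlink hooks are taken into account as well.
 */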
4145
4146/*
4147 * Components of nameidata (or objects it can point to) which may
4148 * need restoring in case fast path lookup fails.
4149 */
4150struct nameidata_outer {
4151	size_t ni_pathlen;
4152	int cn_flags;
4153};
4154
4155struct nameidata_saved {
4156#ifdef INVARIANTS
4157	char *cn_nameptr;
4158	size_t ni_pathlen;
4159#endif
4160};
4161
4162#ifdef INVARIANTS
4163struct cache_fpl_debug {
4164	size_t ni_pathlen;
4165};
4166#endif
4167
4168struct cache_fpl {
4169	struct nameidata *ndp;
4170	struct componentname *cnp;
4171	char *nulchar;
4172	struct vnode *dvp;
4173	struct vnode *tvp;
4174	seqc_t dvp_seqc;
4175	seqc_t tvp_seqc;
4176	uint32_t hash;
4177	struct nameidata_saved snd;
4178	struct nameidata_outer snd_outer;
4179	int line;
4180	enum cache_fpl_status status:8;
4181	bool in_smr;
4182	bool fsearch;
4183	struct pwd **pwd;
4184#ifdef INVARIANTS
4185	struct cache_fpl_debug debug;
4186#endif
4187};
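
/*
 * Note on the fields above: dvp/tvp and their seqc snapshots describe the
 * directory currently being walked and the entry found in it; snd_outer
 * checkpoints nameidata state which is restored if the lockless lookup is
 * aborted or handed over to the regular (locked) path, while snd only
 * carries extra consistency checks under INVARIANTS.
 */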
4188
4189static bool cache_fplookup_mp_supported(struct mount *mp);
4190static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4191static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4192static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4193static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4194static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4195static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4196static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4197static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4198static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4199
4200static void
4201cache_fpl_cleanup_cnp(struct componentname *cnp)
4202{
4203
4204	uma_zfree(namei_zone, cnp->cn_pnbuf);
4205	cnp->cn_pnbuf = NULL;
4206	cnp->cn_nameptr = NULL;
4207}
4208
4209static struct vnode *
4210cache_fpl_handle_root(struct cache_fpl *fpl)
4211{
4212	struct nameidata *ndp;
4213	struct componentname *cnp;
4214
4215	ndp = fpl->ndp;
4216	cnp = fpl->cnp;
4217
4218	MPASS(*(cnp->cn_nameptr) == '/');
4219	cnp->cn_nameptr++;
4220	cache_fpl_pathlen_dec(fpl);
4221
4222	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4223		do {
4224			cnp->cn_nameptr++;
4225			cache_fpl_pathlen_dec(fpl);
4226		} while (*(cnp->cn_nameptr) == '/');
4227	}
4228
4229	return (ndp->ni_rootdir);
4230}
4231
4232static void
4233cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4234{
4235
4236	fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4237	fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4238}
4239
4240static void
4241cache_fpl_checkpoint(struct cache_fpl *fpl)
4242{
4243
4244#ifdef INVARIANTS
4245	fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4246	fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4247#endif
4248}
4249
4250static void
4251cache_fpl_restore_partial(struct cache_fpl *fpl)
4252{
4253
4254	fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4255#ifdef INVARIANTS
4256	fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4257#endif
4258}
4259
4260static void
4261cache_fpl_restore_abort(struct cache_fpl *fpl)
4262{
4263
4264	cache_fpl_restore_partial(fpl);
4265	/*
4266	 * It is 0 on entry by API contract.
4267	 */
4268	fpl->ndp->ni_resflags = 0;
4269	fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4270	fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4271}
4272
4273#ifdef INVARIANTS
4274#define cache_fpl_smr_assert_entered(fpl) ({			\
4275	struct cache_fpl *_fpl = (fpl);				\
4276	MPASS(_fpl->in_smr == true);				\
4277	VFS_SMR_ASSERT_ENTERED();				\
4278})
4279#define cache_fpl_smr_assert_not_entered(fpl) ({		\
4280	struct cache_fpl *_fpl = (fpl);				\
4281	MPASS(_fpl->in_smr == false);				\
4282	VFS_SMR_ASSERT_NOT_ENTERED();				\
4283})
4284static void
4285cache_fpl_assert_status(struct cache_fpl *fpl)
4286{
4287
4288	switch (fpl->status) {
4289	case CACHE_FPL_STATUS_UNSET:
4290		__assert_unreachable();
4291		break;
4292	case CACHE_FPL_STATUS_DESTROYED:
4293	case CACHE_FPL_STATUS_ABORTED:
4294	case CACHE_FPL_STATUS_PARTIAL:
4295	case CACHE_FPL_STATUS_HANDLED:
4296		break;
4297	}
4298}
4299#else
4300#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4301#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4302#define cache_fpl_assert_status(fpl) do { } while (0)
4303#endif
4304
4305#define cache_fpl_smr_enter_initial(fpl) ({			\
4306	struct cache_fpl *_fpl = (fpl);				\
4307	vfs_smr_enter();					\
4308	_fpl->in_smr = true;					\
4309})
4310
4311#define cache_fpl_smr_enter(fpl) ({				\
4312	struct cache_fpl *_fpl = (fpl);				\
4313	MPASS(_fpl->in_smr == false);				\
4314	vfs_smr_enter();					\
4315	_fpl->in_smr = true;					\
4316})
4317
4318#define cache_fpl_smr_exit(fpl) ({				\
4319	struct cache_fpl *_fpl = (fpl);				\
4320	MPASS(_fpl->in_smr == true);				\
4321	vfs_smr_exit();						\
4322	_fpl->in_smr = false;					\
4323})
4324
4325static int
4326cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4327{
4328
4329	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4330		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4331		    ("%s: converting to abort from %d at %d, set at %d\n",
4332		    __func__, fpl->status, line, fpl->line));
4333	}
4334	cache_fpl_smr_assert_not_entered(fpl);
4335	fpl->status = CACHE_FPL_STATUS_ABORTED;
4336	fpl->line = line;
4337	return (CACHE_FPL_FAILED);
4338}
4339
4340#define cache_fpl_aborted_early(x)	cache_fpl_aborted_early_impl((x), __LINE__)
4341
4342static int __noinline
4343cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4344{
4345	struct nameidata *ndp;
4346	struct componentname *cnp;
4347
4348	ndp = fpl->ndp;
4349	cnp = fpl->cnp;
4350
4351	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4352		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4353		    ("%s: converting to abort from %d at %d, set at %d\n",
4354		    __func__, fpl->status, line, fpl->line));
4355	}
4356	fpl->status = CACHE_FPL_STATUS_ABORTED;
4357	fpl->line = line;
4358	if (fpl->in_smr)
4359		cache_fpl_smr_exit(fpl);
4360	cache_fpl_restore_abort(fpl);
4361	/*
4362	 * Resolving symlinks overwrites data passed by the caller.
4363	 * Let namei know.
4364	 */
4365	if (ndp->ni_loopcnt > 0) {
4366		fpl->status = CACHE_FPL_STATUS_DESTROYED;
4367		cache_fpl_cleanup_cnp(cnp);
4368	}
4369	return (CACHE_FPL_FAILED);
4370}
4371
4372#define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
4373
4374static int __noinline
4375cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4376{
4377
4378	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4379	    ("%s: setting to partial at %d, but already set to %d at %d\n",
4380	    __func__, line, fpl->status, fpl->line));
4381	cache_fpl_smr_assert_entered(fpl);
4382	fpl->status = CACHE_FPL_STATUS_PARTIAL;
4383	fpl->line = line;
4384	return (cache_fplookup_partial_setup(fpl));
4385}
4386
4387#define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
4388
4389static int
4390cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4391{
4392
4393	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4394	    ("%s: setting to handled at %d, but already set to %d at %d\n",
4395	    __func__, line, fpl->status, fpl->line));
4396	cache_fpl_smr_assert_not_entered(fpl);
4397	fpl->status = CACHE_FPL_STATUS_HANDLED;
4398	fpl->line = line;
4399	return (0);
4400}
4401
4402#define cache_fpl_handled(x)	cache_fpl_handled_impl((x), __LINE__)
4403
4404static int
4405cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4406{
4407
4408	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4409	    ("%s: setting to handled at %d, but already set to %d at %d\n",
4410	    __func__, line, fpl->status, fpl->line));
4411	MPASS(error != 0);
4412	MPASS(error != CACHE_FPL_FAILED);
4413	cache_fpl_smr_assert_not_entered(fpl);
4414	fpl->status = CACHE_FPL_STATUS_HANDLED;
4415	fpl->line = line;
4416	fpl->dvp = NULL;
4417	fpl->tvp = NULL;
4418	return (error);
4419}
4420
4421#define cache_fpl_handled_error(x, e)	cache_fpl_handled_error_impl((x), (e), __LINE__)
4422
4423static bool
4424cache_fpl_terminated(struct cache_fpl *fpl)
4425{
4426
4427	return (fpl->status != CACHE_FPL_STATUS_UNSET);
4428}
4429
4430#define CACHE_FPL_SUPPORTED_CN_FLAGS \
4431	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4432	 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4433	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4434	 OPENWRITE | WANTIOCTLCAPS)
4435
4436#define CACHE_FPL_INTERNAL_CN_FLAGS \
4437	(ISDOTDOT | MAKEENTRY | ISLASTCN)
4438
4439_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4440    "supported and internal flags overlap");
4441
4442static bool
4443cache_fpl_islastcn(struct nameidata *ndp)
4444{
4445
4446	return (*ndp->ni_next == 0);
4447}
4448
4449static bool
4450cache_fpl_istrailingslash(struct cache_fpl *fpl)
4451{
4452
4453	MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4454	return (*(fpl->nulchar - 1) == '/');
4455}
4456
4457static bool
4458cache_fpl_isdotdot(struct componentname *cnp)
4459{
4460
4461	if (cnp->cn_namelen == 2 &&
4462	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4463		return (true);
4464	return (false);
4465}
4466
4467static bool
4468cache_can_fplookup(struct cache_fpl *fpl)
4469{
4470	struct nameidata *ndp;
4471	struct componentname *cnp;
4472	struct thread *td;
4473
4474	ndp = fpl->ndp;
4475	cnp = fpl->cnp;
4476	td = curthread;
4477
4478	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4479		cache_fpl_aborted_early(fpl);
4480		return (false);
4481	}
4482	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4483		cache_fpl_aborted_early(fpl);
4484		return (false);
4485	}
4486	if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
4487		cache_fpl_aborted_early(fpl);
4488		return (false);
4489	}
4490	if (AUDITING_TD(td)) {
4491		cache_fpl_aborted_early(fpl);
4492		return (false);
4493	}
4494	if (ndp->ni_startdir != NULL) {
4495		cache_fpl_aborted_early(fpl);
4496		return (false);
4497	}
4498	return (true);
4499}
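
/*
 * In other words, the lockless path is only attempted for the common case:
 * any unsupported componentname flag, capability mode (or capability
 * tracing), auditing or an explicitly provided starting directory causes an
 * early fallback to the regular lookup before any traversal is done.
 */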
4500
4501static int __noinline
4502cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4503{
4504	struct nameidata *ndp;
4505	struct componentname *cnp;
4506	int error;
4507	bool fsearch;
4508
4509	ndp = fpl->ndp;
4510	cnp = fpl->cnp;
4511
4512	error = fgetvp_lookup_smr(ndp, vpp, &fsearch);
4513	if (__predict_false(error != 0)) {
4514		return (cache_fpl_aborted(fpl));
4515	}
4516	fpl->fsearch = fsearch;
4517	if ((*vpp)->v_type != VDIR) {
4518		if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4519			cache_fpl_smr_exit(fpl);
4520			return (cache_fpl_handled_error(fpl, ENOTDIR));
4521		}
4522	}
4523	return (0);
4524}
4525
4526static int __noinline
4527cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4528    uint32_t hash)
4529{
4530	struct componentname *cnp;
4531	struct vnode *dvp;
4532
4533	cnp = fpl->cnp;
4534	dvp = fpl->dvp;
4535
4536	cache_fpl_smr_exit(fpl);
4537	if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4538		return (cache_fpl_handled_error(fpl, ENOENT));
4539	else
4540		return (cache_fpl_aborted(fpl));
4541}
4542
4543/*
 * The target vnode is not supported; prepare for the slow path to take over.
4545 */
4546static int __noinline
4547cache_fplookup_partial_setup(struct cache_fpl *fpl)
4548{
4549	struct nameidata *ndp;
4550	struct componentname *cnp;
4551	enum vgetstate dvs;
4552	struct vnode *dvp;
4553	struct pwd *pwd;
4554	seqc_t dvp_seqc;
4555
4556	ndp = fpl->ndp;
4557	cnp = fpl->cnp;
4558	pwd = *(fpl->pwd);
4559	dvp = fpl->dvp;
4560	dvp_seqc = fpl->dvp_seqc;
4561
4562	if (!pwd_hold_smr(pwd)) {
4563		return (cache_fpl_aborted(fpl));
4564	}
4565
4566	/*
4567	 * Note that seqc is checked before the vnode is locked, so by
	 * the time regular lookup gets to it, it may have moved.
4569	 *
4570	 * Ultimately this does not affect correctness, any lookup errors
4571	 * are userspace racing with itself. It is guaranteed that any
4572	 * path which ultimately gets found could also have been found
4573	 * by regular lookup going all the way in absence of concurrent
4574	 * modifications.
4575	 */
4576	dvs = vget_prep_smr(dvp);
4577	cache_fpl_smr_exit(fpl);
4578	if (__predict_false(dvs == VGET_NONE)) {
4579		pwd_drop(pwd);
4580		return (cache_fpl_aborted(fpl));
4581	}
4582
4583	vget_finish_ref(dvp, dvs);
4584	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4585		vrele(dvp);
4586		pwd_drop(pwd);
4587		return (cache_fpl_aborted(fpl));
4588	}
4589
4590	cache_fpl_restore_partial(fpl);
4591#ifdef INVARIANTS
4592	if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4593		panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4594		    cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4595	}
4596#endif
4597
4598	ndp->ni_startdir = dvp;
4599	cnp->cn_flags |= MAKEENTRY;
4600	if (cache_fpl_islastcn(ndp))
4601		cnp->cn_flags |= ISLASTCN;
4602	if (cache_fpl_isdotdot(cnp))
4603		cnp->cn_flags |= ISDOTDOT;
4604
4605	/*
4606	 * Skip potential extra slashes parsing did not take care of.
4607	 * cache_fplookup_skip_slashes explains the mechanism.
4608	 */
4609	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4610		do {
4611			cnp->cn_nameptr++;
4612			cache_fpl_pathlen_dec(fpl);
4613		} while (*(cnp->cn_nameptr) == '/');
4614	}
4615
4616	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4617#ifdef INVARIANTS
4618	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4619		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4620		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4621		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4622	}
4623#endif
4624	return (0);
4625}
4626
4627static int
4628cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4629{
4630	struct componentname *cnp;
4631	struct vnode *tvp;
4632	seqc_t tvp_seqc;
4633	int error, lkflags;
4634
4635	cnp = fpl->cnp;
4636	tvp = fpl->tvp;
4637	tvp_seqc = fpl->tvp_seqc;
4638
4639	if ((cnp->cn_flags & LOCKLEAF) != 0) {
4640		lkflags = LK_SHARED;
4641		if ((cnp->cn_flags & LOCKSHARED) == 0)
4642			lkflags = LK_EXCLUSIVE;
4643		error = vget_finish(tvp, lkflags, tvs);
4644		if (__predict_false(error != 0)) {
4645			return (cache_fpl_aborted(fpl));
4646		}
4647	} else {
4648		vget_finish_ref(tvp, tvs);
4649	}
4650
4651	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4652		if ((cnp->cn_flags & LOCKLEAF) != 0)
4653			vput(tvp);
4654		else
4655			vrele(tvp);
4656		return (cache_fpl_aborted(fpl));
4657	}
4658
4659	return (cache_fpl_handled(fpl));
4660}
4661
4662/*
 * The caller possibly wants to modify the state of the namecache.
4664 */
4665static int __noinline
4666cache_fplookup_final_modifying(struct cache_fpl *fpl)
4667{
4668	struct nameidata *ndp __diagused;
4669	struct componentname *cnp;
4670	enum vgetstate dvs;
4671	struct vnode *dvp, *tvp;
4672	struct mount *mp;
4673	seqc_t dvp_seqc;
4674	int error;
4675	bool docache;
4676
4677	ndp = fpl->ndp;
4678	cnp = fpl->cnp;
4679	dvp = fpl->dvp;
4680	dvp_seqc = fpl->dvp_seqc;
4681
4682	MPASS(*(cnp->cn_nameptr) != '/');
4683	MPASS(cache_fpl_islastcn(ndp));
4684	if ((cnp->cn_flags & LOCKPARENT) == 0)
4685		MPASS((cnp->cn_flags & WANTPARENT) != 0);
4686	MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4687	MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4688	    cnp->cn_nameiop == RENAME);
4689	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4690	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4691
4692	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4693	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4694		docache = false;
4695
4696	/*
	 * Regular lookup nullifies the slash, which we don't do here.
4698	 * Don't take chances with filesystem routines seeing it for
4699	 * the last entry.
4700	 */
4701	if (cache_fpl_istrailingslash(fpl)) {
4702		return (cache_fpl_partial(fpl));
4703	}
4704
4705	mp = atomic_load_ptr(&dvp->v_mount);
4706	if (__predict_false(mp == NULL)) {
4707		return (cache_fpl_aborted(fpl));
4708	}
4709
4710	if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4711		cache_fpl_smr_exit(fpl);
4712		/*
		 * The original code does not check for CREATE, which
		 * might be a bug. For now let the old lookup decide.
4715		 */
4716		if (cnp->cn_nameiop == CREATE) {
4717			return (cache_fpl_aborted(fpl));
4718		}
4719		return (cache_fpl_handled_error(fpl, EROFS));
4720	}
4721
4722	if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4723		cache_fpl_smr_exit(fpl);
4724		return (cache_fpl_handled_error(fpl, EEXIST));
4725	}
4726
4727	/*
4728	 * Secure access to dvp; check cache_fplookup_partial_setup for
4729	 * reasoning.
4730	 *
4731	 * XXX At least UFS requires its lookup routine to be called for
4732	 * the last path component, which leads to some level of complication
4733	 * and inefficiency:
4734	 * - the target routine always locks the target vnode, but our caller
4735	 *   may not need it locked
4736	 * - some of the VOP machinery asserts that the parent is locked, which
	 *   once more may not be required
4738	 *
4739	 * TODO: add a flag for filesystems which don't need this.
4740	 */
4741	dvs = vget_prep_smr(dvp);
4742	cache_fpl_smr_exit(fpl);
4743	if (__predict_false(dvs == VGET_NONE)) {
4744		return (cache_fpl_aborted(fpl));
4745	}
4746
4747	vget_finish_ref(dvp, dvs);
4748	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4749		vrele(dvp);
4750		return (cache_fpl_aborted(fpl));
4751	}
4752
4753	error = vn_lock(dvp, LK_EXCLUSIVE);
4754	if (__predict_false(error != 0)) {
4755		vrele(dvp);
4756		return (cache_fpl_aborted(fpl));
4757	}
4758
4759	tvp = NULL;
4760	cnp->cn_flags |= ISLASTCN;
4761	if (docache)
4762		cnp->cn_flags |= MAKEENTRY;
4763	if (cache_fpl_isdotdot(cnp))
4764		cnp->cn_flags |= ISDOTDOT;
4765	cnp->cn_lkflags = LK_EXCLUSIVE;
4766	error = VOP_LOOKUP(dvp, &tvp, cnp);
4767	switch (error) {
4768	case EJUSTRETURN:
4769	case 0:
4770		break;
4771	case ENOTDIR:
4772	case ENOENT:
4773		vput(dvp);
4774		return (cache_fpl_handled_error(fpl, error));
4775	default:
4776		vput(dvp);
4777		return (cache_fpl_aborted(fpl));
4778	}
4779
4780	fpl->tvp = tvp;
4781
4782	if (tvp == NULL) {
4783		MPASS(error == EJUSTRETURN);
4784		if ((cnp->cn_flags & LOCKPARENT) == 0) {
4785			VOP_UNLOCK(dvp);
4786		}
4787		return (cache_fpl_handled(fpl));
4788	}
4789
4790	/*
4791	 * There are very hairy corner cases concerning various flag combinations
	 * and locking state. In particular, here we only hold one lock instead of
4793	 * two.
4794	 *
4795	 * Skip the complexity as it is of no significance for normal workloads.
4796	 */
4797	if (__predict_false(tvp == dvp)) {
4798		vput(dvp);
4799		vrele(tvp);
4800		return (cache_fpl_aborted(fpl));
4801	}
4802
4803	/*
4804	 * If they want the symlink itself we are fine, but if they want to
	 * follow it, regular lookup has to be engaged.
4806	 */
4807	if (tvp->v_type == VLNK) {
4808		if ((cnp->cn_flags & FOLLOW) != 0) {
4809			vput(dvp);
4810			vput(tvp);
4811			return (cache_fpl_aborted(fpl));
4812		}
4813	}
4814
4815	/*
	 * Since we expect this to be the terminal vnode, it should almost never
4817	 * be a mount point.
4818	 */
4819	if (__predict_false(cache_fplookup_is_mp(fpl))) {
4820		vput(dvp);
4821		vput(tvp);
4822		return (cache_fpl_aborted(fpl));
4823	}
4824
4825	if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4826		vput(dvp);
4827		vput(tvp);
4828		return (cache_fpl_handled_error(fpl, EEXIST));
4829	}
4830
4831	if ((cnp->cn_flags & LOCKLEAF) == 0) {
4832		VOP_UNLOCK(tvp);
4833	}
4834
4835	if ((cnp->cn_flags & LOCKPARENT) == 0) {
4836		VOP_UNLOCK(dvp);
4837	}
4838
4839	return (cache_fpl_handled(fpl));
4840}
4841
4842static int __noinline
4843cache_fplookup_modifying(struct cache_fpl *fpl)
4844{
4845	struct nameidata *ndp;
4846
4847	ndp = fpl->ndp;
4848
4849	if (!cache_fpl_islastcn(ndp)) {
4850		return (cache_fpl_partial(fpl));
4851	}
4852	return (cache_fplookup_final_modifying(fpl));
4853}
4854
4855static int __noinline
4856cache_fplookup_final_withparent(struct cache_fpl *fpl)
4857{
4858	struct componentname *cnp;
4859	enum vgetstate dvs, tvs;
4860	struct vnode *dvp, *tvp;
4861	seqc_t dvp_seqc;
4862	int error;
4863
4864	cnp = fpl->cnp;
4865	dvp = fpl->dvp;
4866	dvp_seqc = fpl->dvp_seqc;
4867	tvp = fpl->tvp;
4868
4869	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4870
4871	/*
	 * For the sake of simplicity this is less efficient than it could be.
4873	 */
4874	dvs = vget_prep_smr(dvp);
4875	if (__predict_false(dvs == VGET_NONE)) {
4876		return (cache_fpl_aborted(fpl));
4877	}
4878	tvs = vget_prep_smr(tvp);
4879	if (__predict_false(tvs == VGET_NONE)) {
4880		cache_fpl_smr_exit(fpl);
4881		vget_abort(dvp, dvs);
4882		return (cache_fpl_aborted(fpl));
4883	}
4884
4885	cache_fpl_smr_exit(fpl);
4886
4887	if ((cnp->cn_flags & LOCKPARENT) != 0) {
4888		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4889		if (__predict_false(error != 0)) {
4890			vget_abort(tvp, tvs);
4891			return (cache_fpl_aborted(fpl));
4892		}
4893	} else {
4894		vget_finish_ref(dvp, dvs);
4895	}
4896
4897	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4898		vget_abort(tvp, tvs);
4899		if ((cnp->cn_flags & LOCKPARENT) != 0)
4900			vput(dvp);
4901		else
4902			vrele(dvp);
4903		return (cache_fpl_aborted(fpl));
4904	}
4905
4906	error = cache_fplookup_final_child(fpl, tvs);
4907	if (__predict_false(error != 0)) {
4908		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4909		    fpl->status == CACHE_FPL_STATUS_DESTROYED);
4910		if ((cnp->cn_flags & LOCKPARENT) != 0)
4911			vput(dvp);
4912		else
4913			vrele(dvp);
4914		return (error);
4915	}
4916
4917	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4918	return (0);
4919}
4920
4921static int
4922cache_fplookup_final(struct cache_fpl *fpl)
4923{
4924	struct componentname *cnp;
4925	enum vgetstate tvs;
4926	struct vnode *dvp, *tvp;
4927	seqc_t dvp_seqc;
4928
4929	cnp = fpl->cnp;
4930	dvp = fpl->dvp;
4931	dvp_seqc = fpl->dvp_seqc;
4932	tvp = fpl->tvp;
4933
4934	MPASS(*(cnp->cn_nameptr) != '/');
4935
4936	if (cnp->cn_nameiop != LOOKUP) {
4937		return (cache_fplookup_final_modifying(fpl));
4938	}
4939
4940	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4941		return (cache_fplookup_final_withparent(fpl));
4942
4943	tvs = vget_prep_smr(tvp);
4944	if (__predict_false(tvs == VGET_NONE)) {
4945		return (cache_fpl_partial(fpl));
4946	}
4947
4948	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4949		cache_fpl_smr_exit(fpl);
4950		vget_abort(tvp, tvs);
4951		return (cache_fpl_aborted(fpl));
4952	}
4953
4954	cache_fpl_smr_exit(fpl);
4955	return (cache_fplookup_final_child(fpl, tvs));
4956}
4957
4958/*
4959 * Comment from locked lookup:
4960 * Check for degenerate name (e.g. / or "") which is a way of talking about a
 * directory, e.g. "/." or ".".
4962 */
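/*
 * For example a lookup of "/" or "///" ends up here: the entire path
 * consists of slashes, so the directory already in hand is the result.
 */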
4963static int __noinline
4964cache_fplookup_degenerate(struct cache_fpl *fpl)
4965{
4966	struct componentname *cnp;
4967	struct vnode *dvp;
4968	enum vgetstate dvs;
4969	int error, lkflags;
4970#ifdef INVARIANTS
4971	char *cp;
4972#endif
4973
4974	fpl->tvp = fpl->dvp;
4975	fpl->tvp_seqc = fpl->dvp_seqc;
4976
4977	cnp = fpl->cnp;
4978	dvp = fpl->dvp;
4979
4980#ifdef INVARIANTS
4981	for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4982		KASSERT(*cp == '/',
4983		    ("%s: encountered non-slash; string [%s]\n", __func__,
4984		    cnp->cn_pnbuf));
4985	}
4986#endif
4987
4988	if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4989		cache_fpl_smr_exit(fpl);
4990		return (cache_fpl_handled_error(fpl, EISDIR));
4991	}
4992
4993	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4994		return (cache_fplookup_final_withparent(fpl));
4995	}
4996
4997	dvs = vget_prep_smr(dvp);
4998	cache_fpl_smr_exit(fpl);
4999	if (__predict_false(dvs == VGET_NONE)) {
5000		return (cache_fpl_aborted(fpl));
5001	}
5002
5003	if ((cnp->cn_flags & LOCKLEAF) != 0) {
5004		lkflags = LK_SHARED;
5005		if ((cnp->cn_flags & LOCKSHARED) == 0)
5006			lkflags = LK_EXCLUSIVE;
5007		error = vget_finish(dvp, lkflags, dvs);
5008		if (__predict_false(error != 0)) {
5009			return (cache_fpl_aborted(fpl));
5010		}
5011	} else {
5012		vget_finish_ref(dvp, dvs);
5013	}
5014	return (cache_fpl_handled(fpl));
5015}
5016
5017static int __noinline
5018cache_fplookup_emptypath(struct cache_fpl *fpl)
5019{
5020	struct nameidata *ndp;
5021	struct componentname *cnp;
5022	enum vgetstate tvs;
5023	struct vnode *tvp;
5024	int error, lkflags;
5025
5026	fpl->tvp = fpl->dvp;
5027	fpl->tvp_seqc = fpl->dvp_seqc;
5028
5029	ndp = fpl->ndp;
5030	cnp = fpl->cnp;
5031	tvp = fpl->tvp;
5032
5033	MPASS(*cnp->cn_pnbuf == '\0');
5034
5035	if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5036		cache_fpl_smr_exit(fpl);
5037		return (cache_fpl_handled_error(fpl, ENOENT));
5038	}
5039
5040	MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5041
5042	tvs = vget_prep_smr(tvp);
5043	cache_fpl_smr_exit(fpl);
5044	if (__predict_false(tvs == VGET_NONE)) {
5045		return (cache_fpl_aborted(fpl));
5046	}
5047
5048	if ((cnp->cn_flags & LOCKLEAF) != 0) {
5049		lkflags = LK_SHARED;
5050		if ((cnp->cn_flags & LOCKSHARED) == 0)
5051			lkflags = LK_EXCLUSIVE;
5052		error = vget_finish(tvp, lkflags, tvs);
5053		if (__predict_false(error != 0)) {
5054			return (cache_fpl_aborted(fpl));
5055		}
5056	} else {
5057		vget_finish_ref(tvp, tvs);
5058	}
5059
5060	ndp->ni_resflags |= NIRES_EMPTYPATH;
5061	return (cache_fpl_handled(fpl));
5062}
5063
5064static int __noinline
5065cache_fplookup_noentry(struct cache_fpl *fpl)
5066{
5067	struct nameidata *ndp;
5068	struct componentname *cnp;
5069	enum vgetstate dvs;
5070	struct vnode *dvp, *tvp;
5071	seqc_t dvp_seqc;
5072	int error;
5073
5074	ndp = fpl->ndp;
5075	cnp = fpl->cnp;
5076	dvp = fpl->dvp;
5077	dvp_seqc = fpl->dvp_seqc;
5078
5079	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
5080	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
5081	if (cnp->cn_nameiop == LOOKUP)
5082		MPASS((cnp->cn_flags & NOCACHE) == 0);
5083	MPASS(!cache_fpl_isdotdot(cnp));
5084
5085	/*
5086	 * Hack: delayed name len checking.
5087	 */
5088	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5089		cache_fpl_smr_exit(fpl);
5090		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5091	}
5092
5093	if (cnp->cn_nameptr[0] == '/') {
5094		return (cache_fplookup_skip_slashes(fpl));
5095	}
5096
5097	if (cnp->cn_pnbuf[0] == '\0') {
5098		return (cache_fplookup_emptypath(fpl));
5099	}
5100
5101	if (cnp->cn_nameptr[0] == '\0') {
5102		if (fpl->tvp == NULL) {
5103			return (cache_fplookup_degenerate(fpl));
5104		}
5105		return (cache_fplookup_trailingslash(fpl));
5106	}
5107
5108	if (cnp->cn_nameiop != LOOKUP) {
5109		fpl->tvp = NULL;
5110		return (cache_fplookup_modifying(fpl));
5111	}
5112
5113	/*
5114	 * Only try to fill in the component if it is the last one,
	 * otherwise not only may there be several to handle, but the
	 * walk may be complicated.
5117	 */
5118	if (!cache_fpl_islastcn(ndp)) {
5119		return (cache_fpl_partial(fpl));
5120	}
5121
5122	/*
	 * Regular lookup nullifies the slash, which we don't do here.
5124	 * Don't take chances with filesystem routines seeing it for
5125	 * the last entry.
5126	 */
5127	if (cache_fpl_istrailingslash(fpl)) {
5128		return (cache_fpl_partial(fpl));
5129	}
5130
5131	/*
5132	 * Secure access to dvp; check cache_fplookup_partial_setup for
5133	 * reasoning.
5134	 */
5135	dvs = vget_prep_smr(dvp);
5136	cache_fpl_smr_exit(fpl);
5137	if (__predict_false(dvs == VGET_NONE)) {
5138		return (cache_fpl_aborted(fpl));
5139	}
5140
5141	vget_finish_ref(dvp, dvs);
5142	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5143		vrele(dvp);
5144		return (cache_fpl_aborted(fpl));
5145	}
5146
5147	error = vn_lock(dvp, LK_SHARED);
5148	if (__predict_false(error != 0)) {
5149		vrele(dvp);
5150		return (cache_fpl_aborted(fpl));
5151	}
5152
5153	tvp = NULL;
5154	/*
5155	 * TODO: provide variants which don't require locking either vnode.
5156	 */
5157	cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5158	cnp->cn_lkflags = LK_SHARED;
5159	if ((cnp->cn_flags & LOCKSHARED) == 0) {
5160		cnp->cn_lkflags = LK_EXCLUSIVE;
5161	}
5162	error = VOP_LOOKUP(dvp, &tvp, cnp);
5163	switch (error) {
5164	case EJUSTRETURN:
5165	case 0:
5166		break;
5167	case ENOTDIR:
5168	case ENOENT:
5169		vput(dvp);
5170		return (cache_fpl_handled_error(fpl, error));
5171	default:
5172		vput(dvp);
5173		return (cache_fpl_aborted(fpl));
5174	}
5175
5176	fpl->tvp = tvp;
5177
5178	if (tvp == NULL) {
5179		MPASS(error == EJUSTRETURN);
5180		if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5181			vput(dvp);
5182		} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5183			VOP_UNLOCK(dvp);
5184		}
5185		return (cache_fpl_handled(fpl));
5186	}
5187
5188	if (tvp->v_type == VLNK) {
5189		if ((cnp->cn_flags & FOLLOW) != 0) {
5190			vput(dvp);
5191			vput(tvp);
5192			return (cache_fpl_aborted(fpl));
5193		}
5194	}
5195
5196	if (__predict_false(cache_fplookup_is_mp(fpl))) {
5197		vput(dvp);
5198		vput(tvp);
5199		return (cache_fpl_aborted(fpl));
5200	}
5201
5202	if ((cnp->cn_flags & LOCKLEAF) == 0) {
5203		VOP_UNLOCK(tvp);
5204	}
5205
5206	if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5207		vput(dvp);
5208	} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5209		VOP_UNLOCK(dvp);
5210	}
5211	return (cache_fpl_handled(fpl));
5212}
5213
5214static int __noinline
5215cache_fplookup_dot(struct cache_fpl *fpl)
5216{
5217	int error;
5218
5219	MPASS(!seqc_in_modify(fpl->dvp_seqc));
5220
5221	if (__predict_false(fpl->dvp->v_type != VDIR)) {
5222		cache_fpl_smr_exit(fpl);
5223		return (cache_fpl_handled_error(fpl, ENOTDIR));
5224	}
5225
5226	/*
5227	 * Just re-assign the value. seqc will be checked later for the first
5228	 * non-dot path component in line and/or before deciding to return the
5229	 * vnode.
5230	 */
5231	fpl->tvp = fpl->dvp;
5232	fpl->tvp_seqc = fpl->dvp_seqc;
5233
5234	SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5235
5236	error = 0;
5237	if (cache_fplookup_is_mp(fpl)) {
5238		error = cache_fplookup_cross_mount(fpl);
5239	}
5240	return (error);
5241}
5242
5243static int __noinline
5244cache_fplookup_dotdot(struct cache_fpl *fpl)
5245{
5246	struct nameidata *ndp;
5247	struct componentname *cnp;
5248	struct namecache *ncp;
5249	struct vnode *dvp;
5250	struct prison *pr;
5251	u_char nc_flag;
5252
5253	ndp = fpl->ndp;
5254	cnp = fpl->cnp;
5255	dvp = fpl->dvp;
5256
5257	MPASS(cache_fpl_isdotdot(cnp));
5258
5259	/*
5260	 * XXX this is racy the same way regular lookup is
5261	 */
5262	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
5263	    pr = pr->pr_parent)
5264		if (dvp == pr->pr_root)
5265			break;
5266
5267	if (dvp == ndp->ni_rootdir ||
5268	    dvp == ndp->ni_topdir ||
5269	    dvp == rootvnode ||
5270	    pr != NULL) {
5271		fpl->tvp = dvp;
5272		fpl->tvp_seqc = vn_seqc_read_any(dvp);
5273		if (seqc_in_modify(fpl->tvp_seqc)) {
5274			return (cache_fpl_aborted(fpl));
5275		}
5276		return (0);
5277	}
5278
5279	if ((dvp->v_vflag & VV_ROOT) != 0) {
5280		/*
5281		 * TODO
5282		 * The opposite of climb mount is needed here.
5283		 */
5284		return (cache_fpl_partial(fpl));
5285	}
5286
5287	if (__predict_false(dvp->v_type != VDIR)) {
5288		cache_fpl_smr_exit(fpl);
5289		return (cache_fpl_handled_error(fpl, ENOTDIR));
5290	}
5291
5292	ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5293	if (ncp == NULL) {
5294		return (cache_fpl_aborted(fpl));
5295	}
5296
5297	nc_flag = atomic_load_char(&ncp->nc_flag);
5298	if ((nc_flag & NCF_ISDOTDOT) != 0) {
5299		if ((nc_flag & NCF_NEGATIVE) != 0)
5300			return (cache_fpl_aborted(fpl));
5301		fpl->tvp = ncp->nc_vp;
5302	} else {
5303		fpl->tvp = ncp->nc_dvp;
5304	}
5305
5306	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5307	if (seqc_in_modify(fpl->tvp_seqc)) {
5308		return (cache_fpl_partial(fpl));
5309	}
5310
5311	/*
	 * The acquire fence needed here is provided by vn_seqc_read_any above.
5313	 */
5314	if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5315		return (cache_fpl_aborted(fpl));
5316	}
5317
5318	if (!cache_ncp_canuse(ncp)) {
5319		return (cache_fpl_aborted(fpl));
5320	}
5321
5322	return (0);
5323}
5324
5325static int __noinline
5326cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5327{
5328	u_char nc_flag __diagused;
5329	bool neg_promote;
5330
5331#ifdef INVARIANTS
5332	nc_flag = atomic_load_char(&ncp->nc_flag);
5333	MPASS((nc_flag & NCF_NEGATIVE) != 0);
5334#endif
5335	/*
5336	 * If they want to create an entry we need to replace this one.
5337	 */
5338	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5339		fpl->tvp = NULL;
5340		return (cache_fplookup_modifying(fpl));
5341	}
5342	neg_promote = cache_neg_hit_prep(ncp);
5343	if (!cache_fpl_neg_ncp_canuse(ncp)) {
5344		cache_neg_hit_abort(ncp);
5345		return (cache_fpl_partial(fpl));
5346	}
5347	if (neg_promote) {
5348		return (cache_fplookup_negative_promote(fpl, ncp, hash));
5349	}
5350	cache_neg_hit_finish(ncp);
5351	cache_fpl_smr_exit(fpl);
5352	return (cache_fpl_handled_error(fpl, ENOENT));
5353}
5354
5355/*
5356 * Resolve a symlink. Called by filesystem-specific routines.
5357 *
5358 * Code flow is:
5359 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5360 */
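/*
 * Worked example: resolving "lnk/file" where "lnk" is a symlink to "dir"
 * first copies the remaining path ("/file") past the link target's length,
 * then copies the target itself to the front, leaving "dir/file" in
 * cn_pnbuf; the parsing state is reset so the main loop restarts from the
 * rewritten string.
 */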
5361int
5362cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5363{
5364	struct nameidata *ndp;
5365	struct componentname *cnp;
5366	size_t adjust;
5367
5368	ndp = fpl->ndp;
5369	cnp = fpl->cnp;
5370
5371	if (__predict_false(len == 0)) {
5372		return (ENOENT);
5373	}
5374
5375	if (__predict_false(len > MAXPATHLEN - 2)) {
5376		if (cache_fpl_istrailingslash(fpl)) {
5377			return (EAGAIN);
5378		}
5379	}
5380
5381	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5382#ifdef INVARIANTS
5383	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5384		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5385		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5386		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5387	}
5388#endif
5389
5390	if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5391		return (ENAMETOOLONG);
5392	}
5393
5394	if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5395		return (ELOOP);
5396	}
5397
5398	adjust = len;
5399	if (ndp->ni_pathlen > 1) {
5400		bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5401	} else {
5402		if (cache_fpl_istrailingslash(fpl)) {
5403			adjust = len + 1;
5404			cnp->cn_pnbuf[len] = '/';
5405			cnp->cn_pnbuf[len + 1] = '\0';
5406		} else {
5407			cnp->cn_pnbuf[len] = '\0';
5408		}
5409	}
5410	bcopy(string, cnp->cn_pnbuf, len);
5411
5412	ndp->ni_pathlen += adjust;
5413	cache_fpl_pathlen_add(fpl, adjust);
5414	cnp->cn_nameptr = cnp->cn_pnbuf;
5415	fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5416	fpl->tvp = NULL;
5417	return (0);
5418}
5419
5420static int __noinline
5421cache_fplookup_symlink(struct cache_fpl *fpl)
5422{
5423	struct mount *mp;
5424	struct nameidata *ndp;
5425	struct componentname *cnp;
5426	struct vnode *dvp, *tvp;
5427	struct pwd *pwd;
5428	int error;
5429
5430	ndp = fpl->ndp;
5431	cnp = fpl->cnp;
5432	dvp = fpl->dvp;
5433	tvp = fpl->tvp;
5434	pwd = *(fpl->pwd);
5435
5436	if (cache_fpl_islastcn(ndp)) {
5437		if ((cnp->cn_flags & FOLLOW) == 0) {
5438			return (cache_fplookup_final(fpl));
5439		}
5440	}
5441
5442	mp = atomic_load_ptr(&dvp->v_mount);
5443	if (__predict_false(mp == NULL)) {
5444		return (cache_fpl_aborted(fpl));
5445	}
5446
5447	/*
	 * Note this check races against setting the flag, just like regular
	 * lookup does.
5450	 */
5451	if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5452		cache_fpl_smr_exit(fpl);
5453		return (cache_fpl_handled_error(fpl, EACCES));
5454	}
5455
5456	error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5457	if (__predict_false(error != 0)) {
5458		switch (error) {
5459		case EAGAIN:
5460			return (cache_fpl_partial(fpl));
5461		case ENOENT:
5462		case ENAMETOOLONG:
5463		case ELOOP:
5464			cache_fpl_smr_exit(fpl);
5465			return (cache_fpl_handled_error(fpl, error));
5466		default:
5467			return (cache_fpl_aborted(fpl));
5468		}
5469	}
5470
5471	if (*(cnp->cn_nameptr) == '/') {
5472		fpl->dvp = cache_fpl_handle_root(fpl);
5473		fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5474		if (seqc_in_modify(fpl->dvp_seqc)) {
5475			return (cache_fpl_aborted(fpl));
5476		}
5477		/*
5478		 * The main loop assumes that ->dvp points to a vnode belonging
5479		 * to a filesystem which can do lockless lookup, but the absolute
5480		 * symlink can be wandering off to one which does not.
5481		 */
5482		mp = atomic_load_ptr(&fpl->dvp->v_mount);
5483		if (__predict_false(mp == NULL)) {
5484			return (cache_fpl_aborted(fpl));
5485		}
5486		if (!cache_fplookup_mp_supported(mp)) {
5487			cache_fpl_checkpoint(fpl);
5488			return (cache_fpl_partial(fpl));
5489		}
5490		if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5491			return (cache_fpl_aborted(fpl));
5492		}
5493	}
5494	return (0);
5495}
5496
5497static int
5498cache_fplookup_next(struct cache_fpl *fpl)
5499{
5500	struct componentname *cnp;
5501	struct namecache *ncp;
5502	struct vnode *dvp, *tvp;
5503	u_char nc_flag;
5504	uint32_t hash;
5505	int error;
5506
5507	cnp = fpl->cnp;
5508	dvp = fpl->dvp;
5509	hash = fpl->hash;
5510
5511	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5512		if (cnp->cn_namelen == 1) {
5513			return (cache_fplookup_dot(fpl));
5514		}
5515		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5516			return (cache_fplookup_dotdot(fpl));
5517		}
5518	}
5519
5520	MPASS(!cache_fpl_isdotdot(cnp));
5521
5522	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5523		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5524		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5525			break;
5526	}
5527
5528	if (__predict_false(ncp == NULL)) {
5529		return (cache_fplookup_noentry(fpl));
5530	}
5531
5532	tvp = atomic_load_ptr(&ncp->nc_vp);
5533	nc_flag = atomic_load_char(&ncp->nc_flag);
5534	if ((nc_flag & NCF_NEGATIVE) != 0) {
5535		return (cache_fplookup_neg(fpl, ncp, hash));
5536	}
5537
5538	if (!cache_ncp_canuse(ncp)) {
5539		return (cache_fpl_partial(fpl));
5540	}
5541
5542	fpl->tvp = tvp;
5543	fpl->tvp_seqc = vn_seqc_read_any(tvp);
5544	if (seqc_in_modify(fpl->tvp_seqc)) {
5545		return (cache_fpl_partial(fpl));
5546	}
5547
5548	counter_u64_add(numposhits, 1);
5549	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5550
5551	error = 0;
5552	if (cache_fplookup_is_mp(fpl)) {
5553		error = cache_fplookup_cross_mount(fpl);
5554	}
5555	return (error);
5556}
5557
5558static bool
5559cache_fplookup_mp_supported(struct mount *mp)
5560{
5561
5562	MPASS(mp != NULL);
5563	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5564		return (false);
5565	return (true);
5566}
5567
5568/*
5569 * Walk up the mount stack (if any).
5570 *
5571 * Correctness is provided in the following ways:
5572 * - all vnodes are protected from freeing with SMR
5573 * - struct mount objects are type stable making them always safe to access
5574 * - stability of the particular mount is provided by busying it
5575 * - relationship between the vnode which is mounted on and the mount is
5576 *   verified with the vnode sequence counter after busying
5577 * - association between root vnode of the mount and the mount is protected
5578 *   by busy
5579 *
5580 * From that point on we can read the sequence counter of the root vnode
5581 * and get the next mount on the stack (if any) using the same protection.
5582 *
5583 * By the end of successful walk we are guaranteed the reached state was
5584 * indeed present at least at some point which matches the regular lookup.
5585 */
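/*
 * For instance, with a nullfs mount stacked on top of a tmpfs mount over the
 * same directory, the first iteration busies the bottom mount and reads its
 * root vnode; since that root is itself mounted on, the loop repeats with
 * the next mount until a root vnode with nothing mounted on it is found.
 */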
5586static int __noinline
5587cache_fplookup_climb_mount(struct cache_fpl *fpl)
5588{
5589	struct mount *mp, *prev_mp;
5590	struct mount_pcpu *mpcpu, *prev_mpcpu;
5591	struct vnode *vp;
5592	seqc_t vp_seqc;
5593
5594	vp = fpl->tvp;
5595	vp_seqc = fpl->tvp_seqc;
5596
5597	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5598	mp = atomic_load_ptr(&vp->v_mountedhere);
5599	if (__predict_false(mp == NULL)) {
5600		return (0);
5601	}
5602
5603	prev_mp = NULL;
5604	for (;;) {
5605		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5606			if (prev_mp != NULL)
5607				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5608			return (cache_fpl_partial(fpl));
5609		}
5610		if (prev_mp != NULL)
5611			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5612		if (!vn_seqc_consistent(vp, vp_seqc)) {
5613			vfs_op_thread_exit_crit(mp, mpcpu);
5614			return (cache_fpl_partial(fpl));
5615		}
5616		if (!cache_fplookup_mp_supported(mp)) {
5617			vfs_op_thread_exit_crit(mp, mpcpu);
5618			return (cache_fpl_partial(fpl));
5619		}
5620		vp = atomic_load_ptr(&mp->mnt_rootvnode);
5621		if (vp == NULL) {
5622			vfs_op_thread_exit_crit(mp, mpcpu);
5623			return (cache_fpl_partial(fpl));
5624		}
5625		vp_seqc = vn_seqc_read_any(vp);
5626		if (seqc_in_modify(vp_seqc)) {
5627			vfs_op_thread_exit_crit(mp, mpcpu);
5628			return (cache_fpl_partial(fpl));
5629		}
5630		prev_mp = mp;
5631		prev_mpcpu = mpcpu;
5632		mp = atomic_load_ptr(&vp->v_mountedhere);
5633		if (mp == NULL)
5634			break;
5635	}
5636
5637	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5638	fpl->tvp = vp;
5639	fpl->tvp_seqc = vp_seqc;
5640	return (0);
5641}
5642
5643static int __noinline
5644cache_fplookup_cross_mount(struct cache_fpl *fpl)
5645{
5646	struct mount *mp;
5647	struct mount_pcpu *mpcpu;
5648	struct vnode *vp;
5649	seqc_t vp_seqc;
5650
5651	vp = fpl->tvp;
5652	vp_seqc = fpl->tvp_seqc;
5653
5654	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5655	mp = atomic_load_ptr(&vp->v_mountedhere);
5656	if (__predict_false(mp == NULL)) {
5657		return (0);
5658	}
5659
5660	if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5661		return (cache_fpl_partial(fpl));
5662	}
5663	if (!vn_seqc_consistent(vp, vp_seqc)) {
5664		vfs_op_thread_exit_crit(mp, mpcpu);
5665		return (cache_fpl_partial(fpl));
5666	}
5667	if (!cache_fplookup_mp_supported(mp)) {
5668		vfs_op_thread_exit_crit(mp, mpcpu);
5669		return (cache_fpl_partial(fpl));
5670	}
5671	vp = atomic_load_ptr(&mp->mnt_rootvnode);
5672	if (__predict_false(vp == NULL)) {
5673		vfs_op_thread_exit_crit(mp, mpcpu);
5674		return (cache_fpl_partial(fpl));
5675	}
5676	vp_seqc = vn_seqc_read_any(vp);
5677	vfs_op_thread_exit_crit(mp, mpcpu);
5678	if (seqc_in_modify(vp_seqc)) {
5679		return (cache_fpl_partial(fpl));
5680	}
5681	mp = atomic_load_ptr(&vp->v_mountedhere);
5682	if (__predict_false(mp != NULL)) {
5683		/*
5684		 * There are possibly more mount points on top.
		 * Normally this does not happen, so for simplicity just start
		 * over.
5687		 */
5688		return (cache_fplookup_climb_mount(fpl));
5689	}
5690
5691	fpl->tvp = vp;
5692	fpl->tvp_seqc = vp_seqc;
5693	return (0);
5694}
5695
5696/*
5697 * Check if a vnode is mounted on.
5698 */
5699static bool
5700cache_fplookup_is_mp(struct cache_fpl *fpl)
5701{
5702	struct vnode *vp;
5703
5704	vp = fpl->tvp;
5705	return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5706}
5707
5708/*
5709 * Parse the path.
5710 *
 * The code was originally copy-pasted from regular lookup and, despite
 * clean-ups, leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
5715 */
5716
5717/*
5718 * Debug ni_pathlen tracking.
5719 */
5720#ifdef INVARIANTS
5721static void
5722cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5723{
5724
5725	fpl->debug.ni_pathlen += n;
5726	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5727	    ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5728}
5729
5730static void
5731cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5732{
5733
5734	fpl->debug.ni_pathlen -= n;
5735	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5736	    ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5737}
5738
5739static void
5740cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5741{
5742
5743	cache_fpl_pathlen_add(fpl, 1);
5744}
5745
5746static void
5747cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5748{
5749
5750	cache_fpl_pathlen_sub(fpl, 1);
5751}
5752#else
5753static void
5754cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5755{
5756}
5757
5758static void
5759cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5760{
5761}
5762
5763static void
5764cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5765{
5766}
5767
5768static void
5769cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5770{
5771}
5772#endif
5773
5774static void
5775cache_fplookup_parse(struct cache_fpl *fpl)
5776{
5777	struct nameidata *ndp;
5778	struct componentname *cnp;
5779	struct vnode *dvp;
5780	char *cp;
5781	uint32_t hash;
5782
5783	ndp = fpl->ndp;
5784	cnp = fpl->cnp;
5785	dvp = fpl->dvp;
5786
5787	/*
5788	 * Find the end of this path component, it is either / or nul.
5789	 *
5790	 * Store / as a temporary sentinel so that we only have one character
	 * to test for. Pathnames tend to be short, so this should not
	 * result in cache misses.
5793	 *
5794	 * TODO: fix this to be word-sized.
5795	 */
5796	MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5797	KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5798	    ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5799	    __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5800	    fpl->nulchar, cnp->cn_pnbuf));
5801	KASSERT(*fpl->nulchar == '\0',
5802	    ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5803	    cnp->cn_pnbuf));
5804	hash = cache_get_hash_iter_start(dvp);
5805	*fpl->nulchar = '/';
5806	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5807		KASSERT(*cp != '\0',
5808		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
5809		    cnp->cn_nameptr));
5810		hash = cache_get_hash_iter(*cp, hash);
5812	}
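	/* Restore the nul clobbered by the sentinel above. */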
5813	*fpl->nulchar = '\0';
5814	fpl->hash = cache_get_hash_iter_finish(hash);
5815
5816	cnp->cn_namelen = cp - cnp->cn_nameptr;
5817	cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5818
5819#ifdef INVARIANTS
5820	/*
5821	 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5822	 * we are going to fail this lookup with ENAMETOOLONG (see below).
5823	 */
5824	if (cnp->cn_namelen <= NAME_MAX) {
5825		if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5826			panic("%s: mismatched hash for [%s] len %ld", __func__,
5827			    cnp->cn_nameptr, cnp->cn_namelen);
5828		}
5829	}
5830#endif
5831
5832	/*
5833	 * Hack: we have to check if the found path component's length exceeds
5834	 * NAME_MAX. However, the condition is very rarely true and the check
5835	 * can be elided in the common case -- if an entry was found in the
5836	 * cache, then it could not have been too long to begin with.
5837	 */
5838	ndp->ni_next = cp;
5839}
5840
5841static void
5842cache_fplookup_parse_advance(struct cache_fpl *fpl)
5843{
5844	struct nameidata *ndp;
5845	struct componentname *cnp;
5846
5847	ndp = fpl->ndp;
5848	cnp = fpl->cnp;
5849
5850	cnp->cn_nameptr = ndp->ni_next;
5851	KASSERT(*(cnp->cn_nameptr) == '/',
5852	    ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5853	    cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5854	cnp->cn_nameptr++;
5855	cache_fpl_pathlen_dec(fpl);
5856}
5857
5858/*
5859 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5860 *
5861 * Lockless lookup elides checking for spurious slashes and, should any be
5862 * present, is guaranteed to fail to find an entry. In that case the caller
5863 * must check whether the name starts with a slash and call this routine,
5864 * which fast-forwards across the spurious slashes and sets the state up for
5865 * a retry.
5866 */
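/*
 * For example, with "foo//bar" the main loop resolves "foo", advances past
 * the first slash and then parses an empty component at the second slash,
 * which is guaranteed to miss in the cache.  This routine then walks past the
 * extra slashes, backs up to the last one (so that
 * cache_fplookup_parse_advance has a slash to consume), points ni_next at it
 * and sets tvp to dvp so that the main loop simply iterates again.
 */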
5867static int __noinline
5868cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5869{
5870	struct nameidata *ndp;
5871	struct componentname *cnp;
5872
5873	ndp = fpl->ndp;
5874	cnp = fpl->cnp;
5875
5876	MPASS(*(cnp->cn_nameptr) == '/');
5877	do {
5878		cnp->cn_nameptr++;
5879		cache_fpl_pathlen_dec(fpl);
5880	} while (*(cnp->cn_nameptr) == '/');
5881
5882	/*
5883	 * Go back to one slash so that cache_fplookup_parse_advance has
5884	 * something to skip.
5885	 */
5886	cnp->cn_nameptr--;
5887	cache_fpl_pathlen_inc(fpl);
5888
5889	/*
5890	 * cache_fplookup_parse_advance starts from ndp->ni_next
5891	 */
5892	ndp->ni_next = cnp->cn_nameptr;
5893
5894	/*
5895	 * See cache_fplookup_dot.
5896	 */
5897	fpl->tvp = fpl->dvp;
5898	fpl->tvp_seqc = fpl->dvp_seqc;
5899
5900	return (0);
5901}
5902
5903/*
5904 * Handle trailing slashes (e.g., "foo/").
5905 *
5906 * If a trailing slash is found the terminal vnode must be a directory.
5907 * Regular lookup shortens the path by nullifying the first trailing slash and
5908 * sets the TRAILINGSLASH flag to denote this took place. Several checks on
5909 * the flag are performed later.
5910 *
5911 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5912 * manner, relying on the invariant that a non-directory vnode will get a miss.
5913 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5914 *
5915 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5916 * and denotes this is the last path component, which avoids looping back.
5917 *
5918 * Only plain lookups are supported for now to limit the corner cases to handle.
5919 */
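/*
 * For example, with "foo/bar///" this routine is entered with cn_nameptr
 * pointing at the terminating nul and cn_namelen == 0.  It rewinds past the
 * trailing slashes and the "bar" component, leaving cn_nameptr at 'b' with
 * cn_namelen == 3, while ni_next keeps pointing at the nul to mark the last
 * component.  The parent directory is then re-established from the namecache
 * entry hanging off the target vnode (unless the component is ".").
 */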
5920static int __noinline
5921cache_fplookup_trailingslash(struct cache_fpl *fpl)
5922{
5923#ifdef INVARIANTS
5924	size_t ni_pathlen;
5925#endif
5926	struct nameidata *ndp;
5927	struct componentname *cnp;
5928	struct namecache *ncp;
5929	struct vnode *tvp;
5930	char *cn_nameptr_orig, *cn_nameptr_slash;
5931	seqc_t tvp_seqc;
5932	u_char nc_flag;
5933
5934	ndp = fpl->ndp;
5935	cnp = fpl->cnp;
5936	tvp = fpl->tvp;
5937	tvp_seqc = fpl->tvp_seqc;
5938
5939	MPASS(fpl->dvp == fpl->tvp);
5940	KASSERT(cache_fpl_istrailingslash(fpl),
5941	    ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5942	    cnp->cn_pnbuf));
5943	KASSERT(cnp->cn_nameptr[0] == '\0',
5944	    ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5945	    cnp->cn_pnbuf));
5946	KASSERT(cnp->cn_namelen == 0,
5947	    ("%s: expected namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5948	    cnp->cn_pnbuf));
5949	MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5950
5951	if (cnp->cn_nameiop != LOOKUP) {
5952		return (cache_fpl_aborted(fpl));
5953	}
5954
5955	if (__predict_false(tvp->v_type != VDIR)) {
5956		if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5957			return (cache_fpl_aborted(fpl));
5958		}
5959		cache_fpl_smr_exit(fpl);
5960		return (cache_fpl_handled_error(fpl, ENOTDIR));
5961	}
5962
5963	/*
5964	 * Denote the last component.
5965	 */
5966	ndp->ni_next = &cnp->cn_nameptr[0];
5967	MPASS(cache_fpl_islastcn(ndp));
5968
5969	/*
5970	 * Unwind trailing slashes.
5971	 */
5972	cn_nameptr_orig = cnp->cn_nameptr;
5973	while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5974		cnp->cn_nameptr--;
5975		if (cnp->cn_nameptr[0] != '/') {
5976			break;
5977		}
5978	}
5979
5980	/*
5981	 * Unwind to the beginning of the path component.
5982	 *
5983	 * Note the path may or may not have started with a slash.
5984	 */
5985	cn_nameptr_slash = cnp->cn_nameptr;
5986	while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5987		cnp->cn_nameptr--;
5988		if (cnp->cn_nameptr[0] == '/') {
5989			break;
5990		}
5991	}
5992	if (cnp->cn_nameptr[0] == '/') {
5993		cnp->cn_nameptr++;
5994	}
5995
5996	cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5997	cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5998	cache_fpl_checkpoint(fpl);
5999
6000#ifdef INVARIANTS
6001	ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
6002	if (ni_pathlen != fpl->debug.ni_pathlen) {
6003		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
6004		    __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
6005		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
6006	}
6007#endif
6008
6009	/*
6010	 * If this was a "./" lookup the parent directory is already correct.
6011	 */
6012	if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
6013		return (0);
6014	}
6015
6016	/*
6017	 * Otherwise we need to look it up.
6018	 */
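	/*
	 * A regular entry cached in v_cache_dd provides the parent directory
	 * via nc_dvp; a ".." entry does not, in which case fall back to the
	 * slow path.
	 */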
6019	tvp = fpl->tvp;
6020	ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
6021	if (__predict_false(ncp == NULL)) {
6022		return (cache_fpl_aborted(fpl));
6023	}
6024	nc_flag = atomic_load_char(&ncp->nc_flag);
6025	if ((nc_flag & NCF_ISDOTDOT) != 0) {
6026		return (cache_fpl_aborted(fpl));
6027	}
6028	fpl->dvp = ncp->nc_dvp;
6029	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
6030	if (seqc_in_modify(fpl->dvp_seqc)) {
6031		return (cache_fpl_aborted(fpl));
6032	}
6033	return (0);
6034}
6035
6036/*
6037 * See the API contract for VOP_FPLOOKUP_VEXEC.
6038 */
6039static int __noinline
6040cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6041{
6042	struct componentname *cnp;
6043	struct vnode *dvp;
6044	seqc_t dvp_seqc;
6045
6046	cnp = fpl->cnp;
6047	dvp = fpl->dvp;
6048	dvp_seqc = fpl->dvp_seqc;
6049
6050	/*
6051	 * Hack: delayed empty path checking.
6052	 */
6053	if (cnp->cn_pnbuf[0] == '\0') {
6054		return (cache_fplookup_emptypath(fpl));
6055	}
6056
6057	/*
6058	 * TODO: Because trailing slashes are ignored, lookup will perform a
6059	 * permission check on the last directory when it should not.  The check
6060	 * may fail, but that failure should be ignored. It is possible to fix
6061	 * this up fully without resorting to regular lookup, but for now just
6062	 * abort.
6063	 */
6064	if (cache_fpl_istrailingslash(fpl)) {
6065		return (cache_fpl_aborted(fpl));
6066	}
6067
6068	/*
6069	 * Hack: delayed degenerate path checking.
6070	 */
6071	if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6072		return (cache_fplookup_degenerate(fpl));
6073	}
6074
6075	/*
6076	 * Hack: delayed name len checking.
6077	 */
6078	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6079		cache_fpl_smr_exit(fpl);
6080		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6081	}
6082
6083	/*
6084	 * Hack: they may be looking up foo/bar, where foo is not a directory.
6085	 * In such a case we need to return ENOTDIR, but we may happen to get
6086	 * here with a different error.
6087	 */
6088	if (dvp->v_type != VDIR) {
6089		error = ENOTDIR;
6090	}
6091
6092	/*
6093	 * Hack: handle O_SEARCH.
6094	 *
6095	 * Open Group Base Specifications Issue 7, 2018 edition states:
6096	 * <quote>
6097	 * If the access mode of the open file description associated with the
6098	 * file descriptor is not O_SEARCH, the function shall check whether
6099	 * directory searches are permitted using the current permissions of
6100	 * the directory underlying the file descriptor. If the access mode is
6101	 * O_SEARCH, the function shall not perform the check.
6102	 * </quote>
6103	 *
6104	 * Regular lookup tests for the NOEXECCHECK flag for every path
6105	 * component to decide whether to do the permission check. However,
6106	 * since most lookups never have the flag (and when they do it is only
6107	 * present for the first path component), lockless lookup only acts on
6108	 * it if there is a permission problem. Here the flag is represented
6109	 * with a boolean so that we don't have to clear it on the way out.
6110	 *
6111	 * For simplicity this always aborts.
6112	 * TODO: check if this is the first lookup and ignore the permission
6113	 * problem. Note the flag has to survive fallback (if it happens to be
6114	 * performed).
6115	 */
6116	if (fpl->fsearch) {
6117		return (cache_fpl_aborted(fpl));
6118	}
6119
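	/*
	 * EAGAIN means the filesystem could not perform the check; provided
	 * dvp is unchanged, punt to the slow path.  Any other error is final
	 * as long as dvp remained stable.
	 */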
6120	switch (error) {
6121	case EAGAIN:
6122		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6123			error = cache_fpl_aborted(fpl);
6124		} else {
6125			cache_fpl_partial(fpl);
6126		}
6127		break;
6128	default:
6129		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6130			error = cache_fpl_aborted(fpl);
6131		} else {
6132			cache_fpl_smr_exit(fpl);
6133			cache_fpl_handled_error(fpl, error);
6134		}
6135		break;
6136	}
6137	return (error);
6138}
6139
6140static int
6141cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6142{
6143	struct nameidata *ndp;
6144	struct componentname *cnp;
6145	struct mount *mp;
6146	int error;
6147
6148	ndp = fpl->ndp;
6149	cnp = fpl->cnp;
6150
6151	cache_fpl_checkpoint(fpl);
6152
6153	/*
6154	 * The vnode at hand is almost always stable, so skip checking it here.
6155	 * At worst this postpones the check until later in the iteration of
6156	 * the main loop.
6157	 */
6158	fpl->dvp = dvp;
6159	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6160
6161	mp = atomic_load_ptr(&dvp->v_mount);
6162	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6163		return (cache_fpl_aborted(fpl));
6164	}
6165
6166	MPASS(fpl->tvp == NULL);
6167
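	/*
	 * Main loop: parse one component, check execute permission on the
	 * current directory, resolve the component, then either finish,
	 * follow a symlink or descend into the result.
	 */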
6168	for (;;) {
6169		cache_fplookup_parse(fpl);
6170
6171		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6172		if (__predict_false(error != 0)) {
6173			error = cache_fplookup_failed_vexec(fpl, error);
6174			break;
6175		}
6176
6177		error = cache_fplookup_next(fpl);
6178		if (__predict_false(cache_fpl_terminated(fpl))) {
6179			break;
6180		}
6181
6182		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6183
6184		if (fpl->tvp->v_type == VLNK) {
6185			error = cache_fplookup_symlink(fpl);
6186			if (cache_fpl_terminated(fpl)) {
6187				break;
6188			}
6189		} else {
6190			if (cache_fpl_islastcn(ndp)) {
6191				error = cache_fplookup_final(fpl);
6192				break;
6193			}
6194
6195			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6196				error = cache_fpl_aborted(fpl);
6197				break;
6198			}
6199
6200			fpl->dvp = fpl->tvp;
6201			fpl->dvp_seqc = fpl->tvp_seqc;
6202			cache_fplookup_parse_advance(fpl);
6203		}
6204
6205		cache_fpl_checkpoint(fpl);
6206	}
6207
6208	return (error);
6209}
6210
6211/*
6212 * Fast path lookup protected with SMR and sequence counters.
6213 *
6214 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6215 *
6216 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6217 * outlined below.
6218 *
6219 * Traditional vnode lookup conceptually looks like this:
6220 *
6221 * vn_lock(current);
6222 * for (;;) {
6223 *	next = find();
6224 *	vn_lock(next);
6225 *	vn_unlock(current);
6226 *	current = next;
6227 *	if (last)
6228 *	    break;
6229 * }
6230 * return (current);
6231 *
6232 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6233 * any modifications thanks to holding respective locks.
6234 *
6235 * The same guarantee can be provided with a combination of safe memory
6236 * reclamation and sequence counters instead. If all operations which affect
6237 * the relationship between the current vnode and the one we are looking for
6238 * also modify the counter, we can verify whether all the conditions held as
6239 * we made the jump. This includes things like permissions, mount points, etc.
6240 * Counter modification is provided by enclosing relevant places in
6241 * vn_seqc_write_begin()/end() calls.
6242 *
6243 * Thus this translates to:
6244 *
6245 * vfs_smr_enter();
6246 * dvp_seqc = seqc_read_any(dvp);
6247 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6248 *     abort();
6249 * for (;;) {
6250 * 	tvp = find();
6251 * 	tvp_seqc = seqc_read_any(tvp);
6252 * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6253 * 	    abort();
6254 * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6255 * 	    abort();
6256 * 	dvp = tvp; // we know nothing of importance has changed
6257 * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6258 * 	if (last)
6259 * 	    break;
6260 * }
6261 * vget(); // secure the vnode
6262 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6263 * 	    abort();
6264 * // at this point we know nothing has changed for any parent<->child pair
6265 * // as they were crossed during the lookup, meaning we matched the guarantee
6266 * // of the locked variant
6267 * return (tvp);
6268 *
6269 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6270 * - they are called while within vfs_smr protection which they must never exit
6271 * - EAGAIN can be returned to denote checking could not be performed; it is
6272 *   always valid to return it
6273 * - if the sequence counter has not changed the result must be valid
6274 * - if the sequence counter has changed both false positives and false negatives
6275 *   are permitted (since the result will be rejected later)
6276 * - for simple unix permission checks vaccess_vexec_smr can be used (sketch below)
6277 *
6278 * Caveats to watch out for:
6279 * - vnodes are passed unlocked and unreferenced with nothing stopping
6280 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6281 *   to use atomic_load_ptr to fetch it.
6282 * - the aforementioned object can also get freed, meaning that, absent other
6283 *   means, it should be protected with vfs_smr
6284 * - either safely checking permissions as they are modified or guaranteeing
6285 *   their stability is left to the routine
6286 */
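/*
 * To illustrate the contract above, a minimal VOP_FPLOOKUP_VEXEC routine for
 * a filesystem whose permission data hangs off v_data could look roughly like
 * the sketch below.  "foofs" and "foonode" are made up for illustration; real
 * implementations live in the respective filesystems.
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct vnode *vp = ap->a_vp;
 *		struct foonode *fnp;
 *
 *		// ->v_data may be concurrently cleared by VOP_RECLAIM
 *		fnp = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(fnp == NULL))
 *			return (EAGAIN);
 *		// plain unix permissions; a stale result is tolerated since
 *		// the caller rejects it if the seqc changed
 *		return (vaccess_vexec_smr(fnp->fn_mode, fnp->fn_uid,
 *		    fnp->fn_gid, ap->a_cred));
 *	}
 */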
6287int
6288cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6289    struct pwd **pwdp)
6290{
6291	struct cache_fpl fpl;
6292	struct pwd *pwd;
6293	struct vnode *dvp;
6294	struct componentname *cnp;
6295	int error;
6296
6297	fpl.status = CACHE_FPL_STATUS_UNSET;
6298	fpl.in_smr = false;
6299	fpl.ndp = ndp;
6300	fpl.cnp = cnp = &ndp->ni_cnd;
6301	MPASS(ndp->ni_lcf == 0);
6302	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6303	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6304	    cnp->cn_flags));
6305	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6306	MPASS(ndp->ni_resflags == 0);
6307
6308	if (__predict_false(!cache_can_fplookup(&fpl))) {
6309		*status = fpl.status;
6310		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6311		return (EOPNOTSUPP);
6312	}
6313
6314	cache_fpl_checkpoint_outer(&fpl);
6315
6316	cache_fpl_smr_enter_initial(&fpl);
6317#ifdef INVARIANTS
6318	fpl.debug.ni_pathlen = ndp->ni_pathlen;
6319#endif
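	/*
	 * ni_pathlen accounts for the terminating nul, so this points right at
	 * it; see the asserts in cache_fplookup_parse.
	 */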
6320	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6321	fpl.fsearch = false;
6322	fpl.tvp = NULL; /* for degenerate path handling */
6323	fpl.pwd = pwdp;
6324	pwd = pwd_get_smr();
6325	*(fpl.pwd) = pwd;
6326	namei_setup_rootdir(ndp, cnp, pwd);
6327	ndp->ni_topdir = pwd->pwd_jdir;
6328
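	/*
	 * Pick the starting directory: the root for absolute paths, otherwise
	 * either the working directory or the directory backing ni_dirfd.
	 */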
6329	if (cnp->cn_pnbuf[0] == '/') {
6330		dvp = cache_fpl_handle_root(&fpl);
6331		ndp->ni_resflags = NIRES_ABS;
6332	} else {
6333		if (ndp->ni_dirfd == AT_FDCWD) {
6334			dvp = pwd->pwd_cdir;
6335		} else {
6336			error = cache_fplookup_dirfd(&fpl, &dvp);
6337			if (__predict_false(error != 0)) {
6338				goto out;
6339			}
6340		}
6341	}
6342
6343	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6344	error = cache_fplookup_impl(dvp, &fpl);
6345out:
6346	cache_fpl_smr_assert_not_entered(&fpl);
6347	cache_fpl_assert_status(&fpl);
6348	*status = fpl.status;
6349	if (SDT_PROBES_ENABLED()) {
6350		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6351		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6352			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6353			    ndp);
6354	}
6355
6356	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6357		MPASS(error != CACHE_FPL_FAILED);
6358		if (error != 0) {
6359			cache_fpl_cleanup_cnp(fpl.cnp);
6360			MPASS(fpl.dvp == NULL);
6361			MPASS(fpl.tvp == NULL);
6362		}
6363		ndp->ni_dvp = fpl.dvp;
6364		ndp->ni_vp = fpl.tvp;
6365	}
6366	return (error);
6367}
6368