1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD$");
39
40#include "opt_ddb.h"
41#include "opt_ktrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/capsicum.h>
46#include <sys/counter.h>
47#include <sys/filedesc.h>
48#include <sys/fnv_hash.h>
49#include <sys/kernel.h>
50#include <sys/ktr.h>
51#include <sys/lock.h>
52#include <sys/malloc.h>
53#include <sys/fcntl.h>
54#include <sys/jail.h>
55#include <sys/mount.h>
56#include <sys/namei.h>
57#include <sys/proc.h>
58#include <sys/seqc.h>
59#include <sys/sdt.h>
60#include <sys/smr.h>
61#include <sys/smp.h>
62#include <sys/syscallsubr.h>
63#include <sys/sysctl.h>
64#include <sys/sysproto.h>
65#include <sys/vnode.h>
66#include <ck_queue.h>
67#ifdef KTRACE
68#include <sys/ktrace.h>
69#endif
70#ifdef INVARIANTS
71#include <machine/_inttypes.h>
72#endif
73
76#include <security/audit/audit.h>
77#include <security/mac/mac_framework.h>
78
79#ifdef DDB
80#include <ddb/ddb.h>
81#endif
82
83#include <vm/uma.h>
84
85/*
86 * High level overview of name caching in the VFS layer.
87 *
88 * Originally caching was implemented as part of UFS, later extracted to allow
89 * use by other filesystems. A decision was made to make it optional and
90 * completely detached from the rest of the kernel, which comes with limitations
91 * outlined near the end of this comment block.
92 *
 * This fundamental choice needs to be revisited. In the meantime, the current
 * state is described below. The significance of all notable routines is
 * explained in comments placed above their implementation. Scattered
 * throughout the file are TODO comments indicating shortcomings which can be
 * fixed without reworking everything (most of the fixes will likely be
 * reusable). Various details are omitted from this explanation so as not to
 * clutter the overview; they have to be checked by reading the code and the
 * associated commentary.
100 *
101 * Keep in mind that it's individual path components which are cached, not full
102 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
103 * one for each name.
104 *
105 * I. Data organization
106 *
107 * Entries are described by "struct namecache" objects and stored in a hash
108 * table. See cache_get_hash for more information.
109 *
 * "struct vnode" contains pointers to source entries (names which can be found
 * when traversing through said vnode), destination entries (names of that
 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
 * the parent vnode.
 *
 * The (directory vnode; name) tuple reliably determines the target entry if
 * it exists.
117 *
118 * Since there are no small locks at this time (all are 32 bytes in size on
119 * LP64), the code works around the problem by introducing lock arrays to
120 * protect hash buckets and vnode lists.
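 *
 * To tie this back to the "foo/bar/baz" example above: the entry for "bar"
 * hangs off the hash chain selected by the (vnode of "foo"; "bar") tuple,
 * sits on the v_cache_src list of "foo" (a name reachable from that
 * directory) and on the v_cache_dst list of "bar" (one of the names of that
 * vnode).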
121 *
122 * II. Filesystem integration
123 *
124 * Filesystems participating in name caching do the following:
125 * - set vop_lookup routine to vfs_cache_lookup
126 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
127 * - if they support lockless lookup (see below), vop_fplookup_vexec and
128 *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
129 *   mount point
130 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
131 *   applicable
132 * - call cache_enter to add entries depending on the MAKEENTRY flag
133 *
134 * With the above in mind, there are 2 entry points when doing lookups:
135 * - ... -> namei -> cache_fplookup -- this is the default
136 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
137 *   should the above fail
138 *
139 * Example code flow how an entry is added:
140 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
141 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
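 *
 * A hedged sketch of the filesystem side ("foofs" is a placeholder for
 * whatever routines the filesystem actually provides):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *		.vop_fplookup_vexec =	foofs_fplookup_vexec,
 *		.vop_fplookup_symlink =	foofs_fplookup_symlink,
 *		...
 *	};
 *
 * with MNTK_FPLOOKUP additionally set on the mount point once the filesystem
 * is known to support lockless lookup.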
142 *
143 * III. Performance considerations
144 *
 * For the lockless case, forward lookup avoids any writes to shared areas
 * apart from the terminal path component. In other words, non-modifying
 * lookups of different files don't suffer any scalability problems in the
 * namecache. Looking up the same file is limited by VFS and goes beyond the
 * scope of this file.
 *
 * At least on amd64 the single-threaded bottleneck for long paths is hashing
 * (see cache_get_hash). There are cases where the code issues an acquire
 * fence multiple times; they can be combined on architectures which suffer
 * from it.
 *
 * For the locked case, each encountered vnode has to be referenced and locked
 * in order to be handed out to the caller (normally that's namei). This
 * introduces a significant single-threaded performance hit and serialization
 * when multi-threaded.
 *
 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 * it avoids any writes to shared areas for any of the components.
 *
 * Unrelated insertions are partially serialized on updating the global entry
 * counter and possibly serialized on colliding bucket or vnode locks.
164 *
165 * IV. Observability
166 *
 * Note that not everything has an explicit dtrace probe, nor should it; thus
 * some of the one-liners below depend on implementation details.
169 *
170 * Examples:
171 *
172 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
173 * # line number, column 2 is status code (see cache_fpl_status)
174 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
175 *
176 * # Lengths of names added by binary name
177 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
178 *
179 * # Same as above but only those which exceed 64 characters
180 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
181 *
182 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
183 * # path is it
184 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
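 *
 * # Which negative entries are being hit and by whom (the
 * # vfs:namecache:lookup:hit-negative probe is defined below, arg1 is the name)
 * dtrace -n 'vfs:namecache:lookup:hit-negative { @[execname, stringof(arg1)] = count(); }'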
185 *
186 * V. Limitations and implementation defects
187 *
188 * - since it is possible there is no entry for an open file, tools like
189 *   "procstat" may fail to resolve fd -> vnode -> path to anything
190 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
191 *   shortage) in which case the above problem applies
192 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
193 *   way, resolving a name may return a different path than the one used to
194 *   open it (even if said path is still valid)
195 * - by default entries are not added for newly created files
 * - adding an entry may need to evict a negative entry first, which happens in
 *   2 distinct places (evicting on lookup, adding in a later VOP), making it
 *   impossible to simply reuse it
 * - there is a simple scheme to evict negative entries as the cache approaches
 *   its capacity, but it is very unclear if doing so is a good idea to begin
 *   with
 * - vnodes are subject to being recycled even if the target inode is left in
 *   memory, which loses the name cache entries when it perhaps should not. In
 *   the case of tmpfs names get duplicated -- kept by the filesystem itself and
 *   by the namecache separately
 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space. It is now hard to replace with malloc due to the dependence on SMR.
 * - lack of better integration with the kernel also turns nullfs into a layered
 *   filesystem instead of something which can take advantage of caching
208 */
209
210static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
211    "Name cache");
212
213SDT_PROVIDER_DECLARE(vfs);
214SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
215    "struct vnode *");
216SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
217    "struct vnode *");
218SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
219    "char *");
220SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
221    "const char *");
222SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
223    "struct namecache *", "int", "int");
224SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
225SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
226    "char *", "struct vnode *");
227SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
228SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
229    "struct vnode *", "char *");
230SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
231    "struct vnode *");
232SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
233    "struct vnode *", "char *");
234SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
235    "char *");
236SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
237    "struct componentname *");
238SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
239    "struct componentname *");
240SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
241SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
242SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
243SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
244SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
245    "struct vnode *");
246SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
247    "char *");
248SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
249    "char *");
250SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
251
252SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
253SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
254SDT_PROBE_DECLARE(vfs, namei, lookup, return);
255
256/*
257 * This structure describes the elements in the cache of recent
258 * names looked up by namei.
259 */
260struct negstate {
261	u_char neg_flag;
262	u_char neg_hit;
263};
264_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
265    "the state must fit in a union with a pointer without growing it");
266
267struct	namecache {
268	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
269	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
270	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
271	struct	vnode *nc_dvp;		/* vnode of parent of name */
272	union {
273		struct	vnode *nu_vp;	/* vnode the name refers to */
274		struct	negstate nu_neg;/* negative entry state */
275	} n_un;
276	u_char	nc_flag;		/* flag bits */
277	u_char	nc_nlen;		/* length of name */
278	char	nc_name[0];		/* segment name + nul */
279};
280
281/*
282 * struct namecache_ts repeats struct namecache layout up to the
283 * nc_nlen member.
284 * struct namecache_ts is used in place of struct namecache when time(s) need
285 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
286 * both a non-dotdot directory name plus dotdot for the directory's
287 * parent.
288 *
289 * See below for alignment requirement.
290 */
291struct	namecache_ts {
292	struct	timespec nc_time;	/* timespec provided by fs */
293	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
294	int	nc_ticks;		/* ticks value when entry was added */
295	int	nc_pad;
296	struct namecache nc_nc;
297};
298
299TAILQ_HEAD(cache_freebatch, namecache);
300
301/*
302 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since other platforms
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
306 */
307#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
308
309/*
310 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
311 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
312 * smaller and the value was bumped to retain the total size, but it
313 * was never re-evaluated for suitability. A simple test counting
314 * lengths during package building shows that the value of 45 covers
315 * about 86% of all added entries, reaching 99% at 65.
316 *
317 * Regardless of the above, use of dedicated zones instead of malloc may be
318 * inducing additional waste. This may be hard to address as said zones are
319 * tied to VFS SMR. Even if retaining them, the current split should be
320 * re-evaluated.
321 */
322#ifdef __LP64__
323#define	CACHE_PATH_CUTOFF	45
324#define	CACHE_LARGE_PAD		6
325#else
326#define	CACHE_PATH_CUTOFF	41
327#define	CACHE_LARGE_PAD		2
328#endif
329
330#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
331#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
332#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
333#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
334
335_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
336_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
337_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
338_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
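
/*
 * For illustration only: on a typical LP64 layout (8-byte pointers and
 * time_t, no surprising padding) the numbers work out as follows:
 *
 *	offsetof(struct namecache, nc_name) = 16 + 16 + 8 + 8 + 8 + 1 + 1 = 58
 *	CACHE_ZONE_SMALL_SIZE    = 58 + 45 + 1      = 104
 *	CACHE_ZONE_LARGE_SIZE    = 58 + 255 + 1 + 6 = 320
 *	offsetof(struct namecache_ts, nc_nc) = 16 + 16 + 4 + 4 = 40
 *	CACHE_ZONE_SMALL_TS_SIZE = 40 + 104         = 144
 *	CACHE_ZONE_LARGE_TS_SIZE = 40 + 320         = 360
 *
 * all multiples of the required alignment (CACHE_ZONE_ALIGNMENT + 1 == 8),
 * which is what CACHE_LARGE_PAD is there to ensure.
 */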
339
340#define	nc_vp		n_un.nu_vp
341#define	nc_neg		n_un.nu_neg
342
343/*
344 * Flags in namecache.nc_flag
345 */
346#define NCF_WHITE	0x01
347#define NCF_ISDOTDOT	0x02
348#define	NCF_TS		0x04
349#define	NCF_DTS		0x08
350#define	NCF_DVDROP	0x10
351#define	NCF_NEGATIVE	0x20
352#define	NCF_INVALID	0x40
353#define	NCF_WIP		0x80
354
355/*
356 * Flags in negstate.neg_flag
357 */
358#define NEG_HOT		0x01
359
360static bool	cache_neg_evict_cond(u_long lnumcache);
361
362/*
363 * Mark an entry as invalid.
364 *
365 * This is called before it starts getting deconstructed.
366 */
367static void
368cache_ncp_invalidate(struct namecache *ncp)
369{
370
371	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
372	    ("%s: entry %p already invalid", __func__, ncp));
373	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
374	atomic_thread_fence_rel();
375}
376
377/*
378 * Check whether the entry can be safely used.
379 *
380 * All places which elide locks are supposed to call this after they are
381 * done with reading from an entry.
382 */
383#define cache_ncp_canuse(ncp)	({					\
384	struct namecache *_ncp = (ncp);					\
385	u_char _nc_flag;						\
386									\
387	atomic_thread_fence_acq();					\
388	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
389	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
390})
391
392/*
393 * Like the above but also checks NCF_WHITE.
394 */
395#define cache_fpl_neg_ncp_canuse(ncp)	({				\
396	struct namecache *_ncp = (ncp);					\
397	u_char _nc_flag;						\
398									\
399	atomic_thread_fence_acq();					\
400	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
401	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
402})
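
/*
 * A minimal usage sketch of the above (hypothetical caller; negative entries
 * and other details elided -- the real lookup routines later in this file
 * follow the same pattern): the fields of interest are read first and
 * validated afterwards, so a failed check means whatever was just read cannot
 * be trusted and the caller has to fall back.
 *
 *	vp = NULL;
 *	vfs_smr_enter();
 *	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 *		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 *		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 *			break;
 *	}
 *	if (ncp != NULL) {
 *		vp = atomic_load_ptr(&ncp->nc_vp);
 *		if (!cache_ncp_canuse(ncp))
 *			vp = NULL;
 *	}
 *	vfs_smr_exit();
 */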
403
404VFS_SMR_DECLARE;
405
406static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
407    "Name cache parameters");
408
409static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
410SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
411    "Total namecache capacity");
412
413u_int ncsizefactor = 2;
414SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
415    "Size factor for namecache");
416
417static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
418SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
419    "Ratio of negative namecache entries");
420
421/*
422 * Negative entry % of namecache capacity above which automatic eviction is allowed.
423 *
424 * Check cache_neg_evict_cond for details.
425 */
426static u_int ncnegminpct = 3;
427
428static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
429SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
430    "Negative entry count above which automatic eviction is allowed");
431
432/*
433 * Structures associated with name caching.
434 */
435#define NCHHASH(hash) \
436	(&nchashtbl[(hash) & nchash])
437static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
438static u_long __read_mostly	nchash;			/* size of hash table */
439SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
440    "Size of namecache hash table");
441static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
442static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
443
444struct nchstats	nchstats;		/* cache effectiveness statistics */
445
446static bool __read_frequently cache_fast_revlookup = true;
447SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
448    &cache_fast_revlookup, 0, "");
449
450static bool __read_mostly cache_rename_add = true;
451SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
452    &cache_rename_add, 0, "");
453
454static u_int __exclusive_cache_line neg_cycle;
455
456#define ncneghash	3
457#define	numneglists	(ncneghash + 1)
458
459struct neglist {
460	struct mtx		nl_evict_lock;
461	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
462	TAILQ_HEAD(, namecache) nl_list;
463	TAILQ_HEAD(, namecache) nl_hotlist;
464	u_long			nl_hotnum;
465} __aligned(CACHE_LINE_SIZE);
466
467static struct neglist neglists[numneglists];
468
469static inline struct neglist *
470NCP2NEGLIST(struct namecache *ncp)
471{
472
473	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
474}
475
476static inline struct negstate *
477NCP2NEGSTATE(struct namecache *ncp)
478{
479
480	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
481	return (&ncp->nc_neg);
482}
483
484#define	numbucketlocks (ncbuckethash + 1)
485static u_int __read_mostly  ncbuckethash;
486static struct mtx_padalign __read_mostly  *bucketlocks;
487#define	HASH2BUCKETLOCK(hash) \
488	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
489
490#define	numvnodelocks (ncvnodehash + 1)
491static u_int __read_mostly  ncvnodehash;
492static struct mtx __read_mostly *vnodelocks;
493static inline struct mtx *
494VP2VNODELOCK(struct vnode *vp)
495{
496
497	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
498}
499
500static void
501cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
502{
503	struct namecache_ts *ncp_ts;
504
505	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
506	    (tsp == NULL && ticksp == NULL),
507	    ("No NCF_TS"));
508
509	if (tsp == NULL)
510		return;
511
512	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
513	*tsp = ncp_ts->nc_time;
514	*ticksp = ncp_ts->nc_ticks;
515}
516
517#ifdef DEBUG_CACHE
518static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
519SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
520    "VFS namecache enabled");
521#endif
522
523/* Export size information to userland */
524SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
525    sizeof(struct namecache), "sizeof(struct namecache)");
526
527/*
528 * The new name cache statistics
529 */
530static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
531    "Name cache statistics");
532
533#define STATNODE_ULONG(name, varname, descr)					\
534	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
535#define STATNODE_COUNTER(name, varname, descr)					\
536	static COUNTER_U64_DEFINE_EARLY(varname);				\
537	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
538	    descr);
539STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
540STATNODE_ULONG(count, numcache, "Number of cache entries");
541STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
542STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
543STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
545STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
546STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(poszaps, numposzaps,
548    "Number of cache hits (positive) we do not want to cache");
549STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
550STATNODE_COUNTER(negzaps, numnegzaps,
551    "Number of cache hits (negative) we do not want to cache");
552STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
553/* These count for vn_getcwd(), too. */
554STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
555STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
556STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
557    "Number of fullpath search errors (VOP_VPTOCNP failures)");
558STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
559STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
560STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
561
562/*
563 * Debug or developer statistics.
564 */
565static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
566    "Name cache debugging");
567#define DEBUGNODE_ULONG(name, varname, descr)					\
568	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
569#define DEBUGNODE_COUNTER(name, varname, descr)					\
570	static COUNTER_U64_DEFINE_EARLY(varname);				\
571	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
572	    descr);
573DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
574    "Number of successful removals after relocking");
575static long zap_bucket_fail;
576DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
577static long zap_bucket_fail2;
578DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
579static long cache_lock_vnodes_cel_3_failures;
580DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
581    "Number of times 3-way vnode locking failed");
582
583static void cache_zap_locked(struct namecache *ncp);
584static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
585    char **freebuf, size_t *buflen);
586static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
587    char **retbuf, size_t *buflen, size_t addend);
588static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
589    char **retbuf, size_t *buflen);
590static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
591    char **retbuf, size_t *len, size_t addend);
592
593static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
594
595static inline void
596cache_assert_vlp_locked(struct mtx *vlp)
597{
598
599	if (vlp != NULL)
600		mtx_assert(vlp, MA_OWNED);
601}
602
603static inline void
604cache_assert_vnode_locked(struct vnode *vp)
605{
606	struct mtx *vlp;
607
608	vlp = VP2VNODELOCK(vp);
609	cache_assert_vlp_locked(vlp);
610}
611
612/*
613 * Directory vnodes with entries are held for two reasons:
614 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
616 *
617 * It will be feasible to stop doing it altogether if all filesystems start
618 * supporting lockless lookup.
619 */
620static void
621cache_hold_vnode(struct vnode *vp)
622{
623
624	cache_assert_vnode_locked(vp);
625	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
626	vhold(vp);
627	counter_u64_add(numcachehv, 1);
628}
629
630static void
631cache_drop_vnode(struct vnode *vp)
632{
633
634	/*
635	 * Called after all locks are dropped, meaning we can't assert
636	 * on the state of v_cache_src.
637	 */
638	vdrop(vp);
639	counter_u64_add(numcachehv, -1);
640}
641
642/*
643 * UMA zones.
644 */
645static uma_zone_t __read_mostly cache_zone_small;
646static uma_zone_t __read_mostly cache_zone_small_ts;
647static uma_zone_t __read_mostly cache_zone_large;
648static uma_zone_t __read_mostly cache_zone_large_ts;
649
650char *
651cache_symlink_alloc(size_t size, int flags)
652{
653
654	if (size < CACHE_ZONE_SMALL_SIZE) {
655		return (uma_zalloc_smr(cache_zone_small, flags));
656	}
657	if (size < CACHE_ZONE_LARGE_SIZE) {
658		return (uma_zalloc_smr(cache_zone_large, flags));
659	}
660	counter_u64_add(symlinktoobig, 1);
661	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
662	return (NULL);
663}
664
665void
666cache_symlink_free(char *string, size_t size)
667{
668
669	MPASS(string != NULL);
670	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
671	    ("%s: size %zu too big", __func__, size));
672
673	if (size < CACHE_ZONE_SMALL_SIZE) {
674		uma_zfree_smr(cache_zone_small, string);
675		return;
676	}
677	if (size < CACHE_ZONE_LARGE_SIZE) {
678		uma_zfree_smr(cache_zone_large, string);
679		return;
680	}
681	__assert_unreachable();
682}
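
/*
 * A hedged usage sketch (hypothetical filesystem code caching a resolved
 * symlink target of "len" bytes, not counting the terminating nul):
 *
 *	buf = cache_symlink_alloc(len + 1, M_WAITOK);
 *	if (buf != NULL) {
 *		memcpy(buf, target, len);
 *		buf[len] = '\0';
 *	}
 *	...
 *	cache_symlink_free(buf, len + 1);
 *
 * The buffers come from the SMR-managed zones above, hence they must only be
 * released with cache_symlink_free and with the size they were allocated with.
 */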
683
684static struct namecache *
685cache_alloc_uma(int len, bool ts)
686{
687	struct namecache_ts *ncp_ts;
688	struct namecache *ncp;
689
690	if (__predict_false(ts)) {
691		if (len <= CACHE_PATH_CUTOFF)
692			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
693		else
694			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
695		ncp = &ncp_ts->nc_nc;
696	} else {
697		if (len <= CACHE_PATH_CUTOFF)
698			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
699		else
700			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
701	}
702	return (ncp);
703}
704
705static void
706cache_free_uma(struct namecache *ncp)
707{
708	struct namecache_ts *ncp_ts;
709
710	if (__predict_false(ncp->nc_flag & NCF_TS)) {
711		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
712		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
713			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
714		else
715			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
716	} else {
717		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
718			uma_zfree_smr(cache_zone_small, ncp);
719		else
720			uma_zfree_smr(cache_zone_large, ncp);
721	}
722}
723
724static struct namecache *
725cache_alloc(int len, bool ts)
726{
727	u_long lnumcache;
728
729	/*
730	 * Avoid blowout in namecache entries.
731	 *
732	 * Bugs:
733	 * 1. filesystems may end up trying to add an already existing entry
734	 * (for example this can happen after a cache miss during concurrent
735	 * lookup), in which case we will call cache_neg_evict despite not
736	 * adding anything.
737	 * 2. the routine may fail to free anything and no provisions are made
738	 * to make it try harder (see the inside for failure modes)
739	 * 3. it only ever looks at negative entries.
740	 */
741	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
742	if (cache_neg_evict_cond(lnumcache)) {
743		lnumcache = atomic_load_long(&numcache);
744	}
745	if (__predict_false(lnumcache >= ncsize)) {
746		atomic_subtract_long(&numcache, 1);
747		counter_u64_add(numdrops, 1);
748		return (NULL);
749	}
750	return (cache_alloc_uma(len, ts));
751}
752
753static void
754cache_free(struct namecache *ncp)
755{
756
757	MPASS(ncp != NULL);
758	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
759		cache_drop_vnode(ncp->nc_dvp);
760	}
761	cache_free_uma(ncp);
762	atomic_subtract_long(&numcache, 1);
763}
764
765static void
766cache_free_batch(struct cache_freebatch *batch)
767{
768	struct namecache *ncp, *nnp;
769	int i;
770
771	i = 0;
772	if (TAILQ_EMPTY(batch))
773		goto out;
774	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
775		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
776			cache_drop_vnode(ncp->nc_dvp);
777		}
778		cache_free_uma(ncp);
779		i++;
780	}
781	atomic_subtract_long(&numcache, i);
782out:
783	SDT_PROBE1(vfs, namecache, purge, batch, i);
784}
785
786/*
787 * Hashing.
788 *
789 * The code was made to use FNV in 2001 and this choice needs to be revisited.
790 *
791 * Short summary of the difficulty:
 * The longest name which can be inserted is NAME_MAX characters in length (or
 * 255 at the time of writing this comment), while the majority of names used
 * in practice are significantly shorter (mostly below 10). More importantly,
 * the majority of lookups are performed on names even shorter than that.
796 *
797 * This poses a problem where hashes which do better than FNV past word size
798 * (or so) tend to come with additional overhead when finalizing the result,
799 * making them noticeably slower for the most commonly used range.
800 *
801 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
802 *
803 * When looking it up the most time consuming part by a large margin (at least
804 * on amd64) is hashing.  Replacing FNV with something which pessimizes short
805 * input would make the slowest part stand out even more.
806 */
807
808/*
809 * TODO: With the value stored we can do better than computing the hash based
810 * on the address.
811 */
812static void
813cache_prehash(struct vnode *vp)
814{
815
816	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
817}
818
819static uint32_t
820cache_get_hash(char *name, u_char len, struct vnode *dvp)
821{
822
823	return (fnv_32_buf(name, len, dvp->v_nchash));
824}
825
826static uint32_t
827cache_get_hash_iter_start(struct vnode *dvp)
828{
829
830	return (dvp->v_nchash);
831}
832
833static uint32_t
834cache_get_hash_iter(char c, uint32_t hash)
835{
836
837	return (fnv_32_buf(&c, 1, hash));
838}
839
840static uint32_t
841cache_get_hash_iter_finish(uint32_t hash)
842{
843
844	return (hash);
845}
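
/*
 * The iterator variants above are meant to produce the same result as
 * cache_get_hash() for the same bytes, letting the lockless lookup hash a
 * path component while scanning it for the terminating '/'. A sketch of the
 * equivalence (fnv_32_buf processes its input a byte at a time, so chaining
 * single-byte calls is exact):
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *
 * yields the same value as cache_get_hash(name, len, dvp).
 */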
846
847static inline struct nchashhead *
848NCP2BUCKET(struct namecache *ncp)
849{
850	uint32_t hash;
851
852	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
853	return (NCHHASH(hash));
854}
855
856static inline struct mtx *
857NCP2BUCKETLOCK(struct namecache *ncp)
858{
859	uint32_t hash;
860
861	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
862	return (HASH2BUCKETLOCK(hash));
863}
864
865#ifdef INVARIANTS
866static void
867cache_assert_bucket_locked(struct namecache *ncp)
868{
869	struct mtx *blp;
870
871	blp = NCP2BUCKETLOCK(ncp);
872	mtx_assert(blp, MA_OWNED);
873}
874
875static void
876cache_assert_bucket_unlocked(struct namecache *ncp)
877{
878	struct mtx *blp;
879
880	blp = NCP2BUCKETLOCK(ncp);
881	mtx_assert(blp, MA_NOTOWNED);
882}
883#else
884#define cache_assert_bucket_locked(x) do { } while (0)
885#define cache_assert_bucket_unlocked(x) do { } while (0)
886#endif
887
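/*
 * Lock ordering helper: whenever two vnode locks (or other lock pointers)
 * need to be held at the same time, they are acquired in increasing address
 * order. Sorting the pointers first means concurrent threads locking the same
 * pair cannot deadlock. A NULL pointer sorts first and is simply skipped by
 * the cache_lock_vnodes/cache_trylock_vnodes routines below.
 */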
888#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
889static void
890_cache_sort_vnodes(void **p1, void **p2)
891{
892	void *tmp;
893
894	MPASS(*p1 != NULL || *p2 != NULL);
895
896	if (*p1 > *p2) {
897		tmp = *p2;
898		*p2 = *p1;
899		*p1 = tmp;
900	}
901}
902
903static void
904cache_lock_all_buckets(void)
905{
906	u_int i;
907
908	for (i = 0; i < numbucketlocks; i++)
909		mtx_lock(&bucketlocks[i]);
910}
911
912static void
913cache_unlock_all_buckets(void)
914{
915	u_int i;
916
917	for (i = 0; i < numbucketlocks; i++)
918		mtx_unlock(&bucketlocks[i]);
919}
920
921static void
922cache_lock_all_vnodes(void)
923{
924	u_int i;
925
926	for (i = 0; i < numvnodelocks; i++)
927		mtx_lock(&vnodelocks[i]);
928}
929
930static void
931cache_unlock_all_vnodes(void)
932{
933	u_int i;
934
935	for (i = 0; i < numvnodelocks; i++)
936		mtx_unlock(&vnodelocks[i]);
937}
938
939static int
940cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
941{
942
943	cache_sort_vnodes(&vlp1, &vlp2);
944
945	if (vlp1 != NULL) {
946		if (!mtx_trylock(vlp1))
947			return (EAGAIN);
948	}
949	if (!mtx_trylock(vlp2)) {
950		if (vlp1 != NULL)
951			mtx_unlock(vlp1);
952		return (EAGAIN);
953	}
954
955	return (0);
956}
957
958static void
959cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
960{
961
962	MPASS(vlp1 != NULL || vlp2 != NULL);
963	MPASS(vlp1 <= vlp2);
964
965	if (vlp1 != NULL)
966		mtx_lock(vlp1);
967	if (vlp2 != NULL)
968		mtx_lock(vlp2);
969}
970
971static void
972cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
973{
974
975	MPASS(vlp1 != NULL || vlp2 != NULL);
976
977	if (vlp1 != NULL)
978		mtx_unlock(vlp1);
979	if (vlp2 != NULL)
980		mtx_unlock(vlp2);
981}
982
983static int
984sysctl_nchstats(SYSCTL_HANDLER_ARGS)
985{
986	struct nchstats snap;
987
988	if (req->oldptr == NULL)
989		return (SYSCTL_OUT(req, 0, sizeof(snap)));
990
991	snap = nchstats;
992	snap.ncs_goodhits = counter_u64_fetch(numposhits);
993	snap.ncs_neghits = counter_u64_fetch(numneghits);
994	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
995	    counter_u64_fetch(numnegzaps);
996	snap.ncs_miss = counter_u64_fetch(nummisszap) +
997	    counter_u64_fetch(nummiss);
998
999	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1000}
1001SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1002    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1003    "VFS cache effectiveness statistics");
1004
1005static void
1006cache_recalc_neg_min(u_int val)
1007{
1008
1009	neg_min = (ncsize * val) / 100;
1010}
1011
1012static int
1013sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1014{
1015	u_int val;
1016	int error;
1017
1018	val = ncnegminpct;
1019	error = sysctl_handle_int(oidp, &val, 0, req);
1020	if (error != 0 || req->newptr == NULL)
1021		return (error);
1022
1023	if (val == ncnegminpct)
1024		return (0);
	if (val > 99)
1026		return (EINVAL);
1027	ncnegminpct = val;
1028	cache_recalc_neg_min(val);
1029	return (0);
1030}
1031
1032SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1033    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1034    "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1035
1036#ifdef DIAGNOSTIC
1037/*
1038 * Grab an atomic snapshot of the name cache hash chain lengths
1039 */
1040static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1041    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1042    "hash table stats");
1043
1044static int
1045sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1046{
1047	struct nchashhead *ncpp;
1048	struct namecache *ncp;
1049	int i, error, n_nchash, *cntbuf;
1050
1051retry:
1052	n_nchash = nchash + 1;	/* nchash is max index, not count */
1053	if (req->oldptr == NULL)
1054		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1055	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1056	cache_lock_all_buckets();
1057	if (n_nchash != nchash + 1) {
1058		cache_unlock_all_buckets();
1059		free(cntbuf, M_TEMP);
1060		goto retry;
1061	}
1062	/* Scan hash tables counting entries */
1063	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1064		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1065			cntbuf[i]++;
1066	cache_unlock_all_buckets();
1067	for (error = 0, i = 0; i < n_nchash; i++)
1068		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1069			break;
1070	free(cntbuf, M_TEMP);
1071	return (error);
1072}
1073SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1074    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1075    "nchash chain lengths");
1076
1077static int
1078sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1079{
1080	int error;
1081	struct nchashhead *ncpp;
1082	struct namecache *ncp;
1083	int n_nchash;
1084	int count, maxlength, used, pct;
1085
1086	if (!req->oldptr)
1087		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1088
1089	cache_lock_all_buckets();
1090	n_nchash = nchash + 1;	/* nchash is max index, not count */
1091	used = 0;
1092	maxlength = 0;
1093
1094	/* Scan hash tables for applicable entries */
1095	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1096		count = 0;
1097		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1098			count++;
1099		}
1100		if (count)
1101			used++;
1102		if (maxlength < count)
1103			maxlength = count;
1104	}
1105	n_nchash = nchash + 1;
1106	cache_unlock_all_buckets();
1107	pct = (used * 100) / (n_nchash / 100);
1108	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1109	if (error)
1110		return (error);
1111	error = SYSCTL_OUT(req, &used, sizeof(used));
1112	if (error)
1113		return (error);
1114	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1115	if (error)
1116		return (error);
1117	error = SYSCTL_OUT(req, &pct, sizeof(pct));
1118	if (error)
1119		return (error);
1120	return (0);
1121}
1122SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1123    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1124    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1125#endif
1126
1127/*
1128 * Negative entries management
1129 *
1130 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover, malicious users can keep performing bogus lookups,
 * adding even more entries. For example "make tinderbox" as of writing this
1133 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1134 * negative.
1135 *
1136 * As such, a rather aggressive eviction method is needed. The currently
1137 * employed method is a placeholder.
1138 *
1139 * Entries are split over numneglists separate lists, each of which is further
1140 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
1142 */
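
/*
 * Rough lifecycle of a negative entry under the current scheme (see the
 * respective routines below for details glossed over here):
 *
 *	cache_enter_time() with a NULL vnode
 *	    -> cache_neg_init() + cache_neg_insert(): the entry lands on a
 *	       cold list
 *	lookups hitting the entry
 *	    -> cache_neg_hit_prep() counts hits; once CACHE_NEG_PROMOTION_THRESH
 *	       is reached the caller promotes it via cache_neg_promote*()
 *	cache_alloc() approaching capacity
 *	    -> cache_neg_evict_cond() -> cache_neg_evict(), which demotes one
 *	       hot entry and zaps the least used of the first few cold entries
 */
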
1143static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1144    "Name cache negative entry statistics");
1145
1146SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1147    "Number of negative cache entries");
1148
1149static COUNTER_U64_DEFINE_EARLY(neg_created);
1150SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1151    "Number of created negative entries");
1152
1153static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1154SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1155    "Number of evicted negative entries");
1156
1157static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1158SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1159    &neg_evict_skipped_empty,
1160    "Number of times evicting failed due to lack of entries");
1161
1162static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1163SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1164    &neg_evict_skipped_missed,
1165    "Number of times evicting failed due to target entry disappearing");
1166
1167static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1168SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1169    &neg_evict_skipped_contended,
1170    "Number of times evicting failed due to contention");
1171
1172SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1173    "Number of cache hits (negative)");
1174
1175static int
1176sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1177{
1178	int i, out;
1179
1180	out = 0;
1181	for (i = 0; i < numneglists; i++)
1182		out += neglists[i].nl_hotnum;
1183
1184	return (SYSCTL_OUT(req, &out, sizeof(out)));
1185}
1186SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1187    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1188    "Number of hot negative entries");
1189
1190static void
1191cache_neg_init(struct namecache *ncp)
1192{
1193	struct negstate *ns;
1194
1195	ncp->nc_flag |= NCF_NEGATIVE;
1196	ns = NCP2NEGSTATE(ncp);
1197	ns->neg_flag = 0;
1198	ns->neg_hit = 0;
1199	counter_u64_add(neg_created, 1);
1200}
1201
1202#define CACHE_NEG_PROMOTION_THRESH 2
1203
1204static bool
1205cache_neg_hit_prep(struct namecache *ncp)
1206{
1207	struct negstate *ns;
1208	u_char n;
1209
1210	ns = NCP2NEGSTATE(ncp);
1211	n = atomic_load_char(&ns->neg_hit);
1212	for (;;) {
1213		if (n >= CACHE_NEG_PROMOTION_THRESH)
1214			return (false);
1215		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1216			break;
1217	}
1218	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1219}
1220
1221/*
1222 * Nothing to do here but it is provided for completeness as some
1223 * cache_neg_hit_prep callers may end up returning without even
1224 * trying to promote.
1225 */
1226#define cache_neg_hit_abort(ncp)	do { } while (0)
1227
1228static void
1229cache_neg_hit_finish(struct namecache *ncp)
1230{
1231
1232	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1233	counter_u64_add(numneghits, 1);
1234}
1235
1236/*
1237 * Move a negative entry to the hot list.
1238 */
1239static void
1240cache_neg_promote_locked(struct namecache *ncp)
1241{
1242	struct neglist *nl;
1243	struct negstate *ns;
1244
1245	ns = NCP2NEGSTATE(ncp);
1246	nl = NCP2NEGLIST(ncp);
1247	mtx_assert(&nl->nl_lock, MA_OWNED);
1248	if ((ns->neg_flag & NEG_HOT) == 0) {
1249		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1250		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1251		nl->nl_hotnum++;
1252		ns->neg_flag |= NEG_HOT;
1253	}
1254}
1255
1256/*
1257 * Move a hot negative entry to the cold list.
1258 */
1259static void
1260cache_neg_demote_locked(struct namecache *ncp)
1261{
1262	struct neglist *nl;
1263	struct negstate *ns;
1264
1265	ns = NCP2NEGSTATE(ncp);
1266	nl = NCP2NEGLIST(ncp);
1267	mtx_assert(&nl->nl_lock, MA_OWNED);
1268	MPASS(ns->neg_flag & NEG_HOT);
1269	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1270	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1271	nl->nl_hotnum--;
1272	ns->neg_flag &= ~NEG_HOT;
1273	atomic_store_char(&ns->neg_hit, 0);
1274}
1275
1276/*
1277 * Move a negative entry to the hot list if it matches the lookup.
1278 *
1279 * We have to take locks, but they may be contended and in the worst
1280 * case we may need to go off CPU. We don't want to spin within the
1281 * smr section and we can't block with it. Exiting the section means
1282 * the found entry could have been evicted. We are going to look it
1283 * up again.
1284 */
1285static bool
1286cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1287    struct namecache *oncp, uint32_t hash)
1288{
1289	struct namecache *ncp;
1290	struct neglist *nl;
1291	u_char nc_flag;
1292
1293	nl = NCP2NEGLIST(oncp);
1294
1295	mtx_lock(&nl->nl_lock);
1296	/*
1297	 * For hash iteration.
1298	 */
1299	vfs_smr_enter();
1300
1301	/*
1302	 * Avoid all surprises by only succeeding if we got the same entry and
1303	 * bailing completely otherwise.
1304	 * XXX There are no provisions to keep the vnode around, meaning we may
1305	 * end up promoting a negative entry for a *new* vnode and returning
1306	 * ENOENT on its account. This is the error we want to return anyway
1307	 * and promotion is harmless.
1308	 *
1309	 * In particular at this point there can be a new ncp which matches the
1310	 * search but hashes to a different neglist.
1311	 */
1312	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1313		if (ncp == oncp)
1314			break;
1315	}
1316
1317	/*
1318	 * No match to begin with.
1319	 */
1320	if (__predict_false(ncp == NULL)) {
1321		goto out_abort;
1322	}
1323
1324	/*
1325	 * The newly found entry may be something different...
1326	 */
1327	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1328	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1329		goto out_abort;
1330	}
1331
1332	/*
1333	 * ... and not even negative.
1334	 */
1335	nc_flag = atomic_load_char(&ncp->nc_flag);
1336	if ((nc_flag & NCF_NEGATIVE) == 0) {
1337		goto out_abort;
1338	}
1339
1340	if (!cache_ncp_canuse(ncp)) {
1341		goto out_abort;
1342	}
1343
1344	cache_neg_promote_locked(ncp);
1345	cache_neg_hit_finish(ncp);
1346	vfs_smr_exit();
1347	mtx_unlock(&nl->nl_lock);
1348	return (true);
1349out_abort:
1350	vfs_smr_exit();
1351	mtx_unlock(&nl->nl_lock);
1352	return (false);
1353}
1354
1355static void
1356cache_neg_promote(struct namecache *ncp)
1357{
1358	struct neglist *nl;
1359
1360	nl = NCP2NEGLIST(ncp);
1361	mtx_lock(&nl->nl_lock);
1362	cache_neg_promote_locked(ncp);
1363	mtx_unlock(&nl->nl_lock);
1364}
1365
1366static void
1367cache_neg_insert(struct namecache *ncp)
1368{
1369	struct neglist *nl;
1370
1371	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1372	cache_assert_bucket_locked(ncp);
1373	nl = NCP2NEGLIST(ncp);
1374	mtx_lock(&nl->nl_lock);
1375	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1376	mtx_unlock(&nl->nl_lock);
1377	atomic_add_long(&numneg, 1);
1378}
1379
1380static void
1381cache_neg_remove(struct namecache *ncp)
1382{
1383	struct neglist *nl;
1384	struct negstate *ns;
1385
1386	cache_assert_bucket_locked(ncp);
1387	nl = NCP2NEGLIST(ncp);
1388	ns = NCP2NEGSTATE(ncp);
1389	mtx_lock(&nl->nl_lock);
1390	if ((ns->neg_flag & NEG_HOT) != 0) {
1391		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1392		nl->nl_hotnum--;
1393	} else {
1394		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1395	}
1396	mtx_unlock(&nl->nl_lock);
1397	atomic_subtract_long(&numneg, 1);
1398}
1399
1400static struct neglist *
1401cache_neg_evict_select_list(void)
1402{
1403	struct neglist *nl;
1404	u_int c;
1405
1406	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1407	nl = &neglists[c % numneglists];
1408	if (!mtx_trylock(&nl->nl_evict_lock)) {
1409		counter_u64_add(neg_evict_skipped_contended, 1);
1410		return (NULL);
1411	}
1412	return (nl);
1413}
1414
1415static struct namecache *
1416cache_neg_evict_select_entry(struct neglist *nl)
1417{
1418	struct namecache *ncp, *lncp;
1419	struct negstate *ns, *lns;
1420	int i;
1421
1422	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1423	mtx_assert(&nl->nl_lock, MA_OWNED);
1424	ncp = TAILQ_FIRST(&nl->nl_list);
1425	if (ncp == NULL)
1426		return (NULL);
1427	lncp = ncp;
1428	lns = NCP2NEGSTATE(lncp);
1429	for (i = 1; i < 4; i++) {
1430		ncp = TAILQ_NEXT(ncp, nc_dst);
1431		if (ncp == NULL)
1432			break;
1433		ns = NCP2NEGSTATE(ncp);
1434		if (ns->neg_hit < lns->neg_hit) {
1435			lncp = ncp;
1436			lns = ns;
1437		}
1438	}
1439	return (lncp);
1440}
1441
1442static bool
1443cache_neg_evict(void)
1444{
1445	struct namecache *ncp, *ncp2;
1446	struct neglist *nl;
1447	struct vnode *dvp;
1448	struct mtx *dvlp;
1449	struct mtx *blp;
1450	uint32_t hash;
1451	u_char nlen;
1452	bool evicted;
1453
1454	nl = cache_neg_evict_select_list();
1455	if (nl == NULL) {
1456		return (false);
1457	}
1458
1459	mtx_lock(&nl->nl_lock);
1460	ncp = TAILQ_FIRST(&nl->nl_hotlist);
1461	if (ncp != NULL) {
1462		cache_neg_demote_locked(ncp);
1463	}
1464	ncp = cache_neg_evict_select_entry(nl);
1465	if (ncp == NULL) {
1466		counter_u64_add(neg_evict_skipped_empty, 1);
1467		mtx_unlock(&nl->nl_lock);
1468		mtx_unlock(&nl->nl_evict_lock);
1469		return (false);
1470	}
1471	nlen = ncp->nc_nlen;
1472	dvp = ncp->nc_dvp;
1473	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1474	dvlp = VP2VNODELOCK(dvp);
1475	blp = HASH2BUCKETLOCK(hash);
1476	mtx_unlock(&nl->nl_lock);
1477	mtx_unlock(&nl->nl_evict_lock);
1478	mtx_lock(dvlp);
1479	mtx_lock(blp);
1480	/*
1481	 * Note that since all locks were dropped above, the entry may be
1482	 * gone or reallocated to be something else.
1483	 */
1484	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1485		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1486		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1487			break;
1488	}
1489	if (ncp2 == NULL) {
1490		counter_u64_add(neg_evict_skipped_missed, 1);
1491		ncp = NULL;
1492		evicted = false;
1493	} else {
1494		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1495		MPASS(blp == NCP2BUCKETLOCK(ncp));
1496		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1497		    ncp->nc_name);
1498		cache_zap_locked(ncp);
1499		counter_u64_add(neg_evicted, 1);
1500		evicted = true;
1501	}
1502	mtx_unlock(blp);
1503	mtx_unlock(dvlp);
1504	if (ncp != NULL)
1505		cache_free(ncp);
1506	return (evicted);
1507}
1508
1509/*
1510 * Maybe evict a negative entry to create more room.
1511 *
 * The ncnegfactor parameter limits what fraction of the total count
 * negative entries can comprise. However, if the cache is just
 * warming up, this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
1517 *
1518 * Try evicting if the cache is close to full capacity regardless of
1519 * other considerations.
1520 */
1521static bool
1522cache_neg_evict_cond(u_long lnumcache)
1523{
1524	u_long lnumneg;
1525
1526	if (ncsize - 1000 < lnumcache)
1527		goto out_evict;
1528	lnumneg = atomic_load_long(&numneg);
1529	if (lnumneg < neg_min)
1530		return (false);
1531	if (lnumneg * ncnegfactor < lnumcache)
1532		return (false);
1533out_evict:
1534	return (cache_neg_evict());
1535}
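
/*
 * Worked example with hypothetical numbers: with ncsize = 100000,
 * ncnegminpct = 3 (so neg_min = 3000) and the default ncnegfactor = 5,
 * eviction at 50000 total entries requires at least 10000 negative ones
 * (10000 * 5 >= 50000); 9000 negative entries would not trigger it. Once the
 * total exceeds 99000 (ncsize - 1000), eviction is attempted regardless of
 * the negative entry count.
 */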
1536
1537/*
1538 * cache_zap_locked():
1539 *
1540 *   Removes a namecache entry from cache, whether it contains an actual
1541 *   pointer to a vnode or if it is just a negative cache entry.
1542 */
1543static void
1544cache_zap_locked(struct namecache *ncp)
1545{
1546	struct nchashhead *ncpp;
1547	struct vnode *dvp, *vp;
1548
1549	dvp = ncp->nc_dvp;
1550	vp = ncp->nc_vp;
1551
1552	if (!(ncp->nc_flag & NCF_NEGATIVE))
1553		cache_assert_vnode_locked(vp);
1554	cache_assert_vnode_locked(dvp);
1555	cache_assert_bucket_locked(ncp);
1556
1557	cache_ncp_invalidate(ncp);
1558
1559	ncpp = NCP2BUCKET(ncp);
1560	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1561	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1562		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1563		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1564		if (ncp == vp->v_cache_dd) {
1565			atomic_store_ptr(&vp->v_cache_dd, NULL);
1566		}
1567	} else {
1568		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1569		cache_neg_remove(ncp);
1570	}
1571	if (ncp->nc_flag & NCF_ISDOTDOT) {
1572		if (ncp == dvp->v_cache_dd) {
1573			atomic_store_ptr(&dvp->v_cache_dd, NULL);
1574		}
1575	} else {
1576		LIST_REMOVE(ncp, nc_src);
1577		if (LIST_EMPTY(&dvp->v_cache_src)) {
1578			ncp->nc_flag |= NCF_DVDROP;
1579		}
1580	}
1581}
1582
1583static void
1584cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1585{
1586	struct mtx *blp;
1587
1588	MPASS(ncp->nc_dvp == vp);
1589	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1590	cache_assert_vnode_locked(vp);
1591
1592	blp = NCP2BUCKETLOCK(ncp);
1593	mtx_lock(blp);
1594	cache_zap_locked(ncp);
1595	mtx_unlock(blp);
1596}
1597
1598static bool
1599cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1600    struct mtx **vlpp)
1601{
1602	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1603	struct mtx *blp;
1604
1605	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1606	cache_assert_vnode_locked(vp);
1607
1608	if (ncp->nc_flag & NCF_NEGATIVE) {
1609		if (*vlpp != NULL) {
1610			mtx_unlock(*vlpp);
1611			*vlpp = NULL;
1612		}
1613		cache_zap_negative_locked_vnode_kl(ncp, vp);
1614		return (true);
1615	}
1616
1617	pvlp = VP2VNODELOCK(vp);
1618	blp = NCP2BUCKETLOCK(ncp);
1619	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1620	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1621
1622	if (*vlpp == vlp1 || *vlpp == vlp2) {
1623		to_unlock = *vlpp;
1624		*vlpp = NULL;
1625	} else {
1626		if (*vlpp != NULL) {
1627			mtx_unlock(*vlpp);
1628			*vlpp = NULL;
1629		}
1630		cache_sort_vnodes(&vlp1, &vlp2);
1631		if (vlp1 == pvlp) {
1632			mtx_lock(vlp2);
1633			to_unlock = vlp2;
1634		} else {
1635			if (!mtx_trylock(vlp1))
1636				goto out_relock;
1637			to_unlock = vlp1;
1638		}
1639	}
1640	mtx_lock(blp);
1641	cache_zap_locked(ncp);
1642	mtx_unlock(blp);
1643	if (to_unlock != NULL)
1644		mtx_unlock(to_unlock);
1645	return (true);
1646
1647out_relock:
1648	mtx_unlock(vlp2);
1649	mtx_lock(vlp1);
1650	mtx_lock(vlp2);
1651	MPASS(*vlpp == NULL);
1652	*vlpp = vlp1;
1653	return (false);
1654}
1655
1656/*
1657 * If trylocking failed we can get here. We know enough to take all needed locks
1658 * in the right order and re-lookup the entry.
1659 */
1660static int
1661cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1662    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1663    struct mtx *blp)
1664{
1665	struct namecache *rncp;
1666
1667	cache_assert_bucket_unlocked(ncp);
1668
1669	cache_sort_vnodes(&dvlp, &vlp);
1670	cache_lock_vnodes(dvlp, vlp);
1671	mtx_lock(blp);
1672	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1673		if (rncp == ncp && rncp->nc_dvp == dvp &&
1674		    rncp->nc_nlen == cnp->cn_namelen &&
1675		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1676			break;
1677	}
1678	if (rncp != NULL) {
1679		cache_zap_locked(rncp);
1680		mtx_unlock(blp);
1681		cache_unlock_vnodes(dvlp, vlp);
1682		counter_u64_add(zap_bucket_relock_success, 1);
1683		return (0);
1684	}
1685
1686	mtx_unlock(blp);
1687	cache_unlock_vnodes(dvlp, vlp);
1688	return (EAGAIN);
1689}
1690
1691static int __noinline
1692cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1693    uint32_t hash, struct mtx *blp)
1694{
1695	struct mtx *dvlp, *vlp;
1696	struct vnode *dvp;
1697
1698	cache_assert_bucket_locked(ncp);
1699
1700	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1701	vlp = NULL;
1702	if (!(ncp->nc_flag & NCF_NEGATIVE))
1703		vlp = VP2VNODELOCK(ncp->nc_vp);
1704	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1705		cache_zap_locked(ncp);
1706		mtx_unlock(blp);
1707		cache_unlock_vnodes(dvlp, vlp);
1708		return (0);
1709	}
1710
1711	dvp = ncp->nc_dvp;
1712	mtx_unlock(blp);
1713	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1714}
1715
1716static __noinline int
1717cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1718{
1719	struct namecache *ncp;
1720	struct mtx *blp;
1721	struct mtx *dvlp, *dvlp2;
1722	uint32_t hash;
1723	int error;
1724
1725	if (cnp->cn_namelen == 2 &&
1726	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1727		dvlp = VP2VNODELOCK(dvp);
1728		dvlp2 = NULL;
1729		mtx_lock(dvlp);
1730retry_dotdot:
1731		ncp = dvp->v_cache_dd;
1732		if (ncp == NULL) {
1733			mtx_unlock(dvlp);
1734			if (dvlp2 != NULL)
1735				mtx_unlock(dvlp2);
1736			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1737			return (0);
1738		}
1739		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1740			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1741				goto retry_dotdot;
1742			MPASS(dvp->v_cache_dd == NULL);
1743			mtx_unlock(dvlp);
1744			if (dvlp2 != NULL)
1745				mtx_unlock(dvlp2);
1746			cache_free(ncp);
1747		} else {
1748			atomic_store_ptr(&dvp->v_cache_dd, NULL);
1749			mtx_unlock(dvlp);
1750			if (dvlp2 != NULL)
1751				mtx_unlock(dvlp2);
1752		}
1753		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1754		return (1);
1755	}
1756
1757	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1758	blp = HASH2BUCKETLOCK(hash);
1759retry:
1760	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1761		goto out_no_entry;
1762
1763	mtx_lock(blp);
1764
1765	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1766		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1767		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1768			break;
1769	}
1770
1771	if (ncp == NULL) {
1772		mtx_unlock(blp);
1773		goto out_no_entry;
1774	}
1775
1776	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1777	if (__predict_false(error != 0)) {
1778		zap_bucket_fail++;
1779		goto retry;
1780	}
1781	counter_u64_add(numposzaps, 1);
1782	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1783	cache_free(ncp);
1784	return (1);
1785out_no_entry:
1786	counter_u64_add(nummisszap, 1);
1787	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1788	return (0);
1789}
1790
1791static int __noinline
1792cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1793    struct timespec *tsp, int *ticksp)
1794{
1795	int ltype;
1796
1797	*vpp = dvp;
1798	counter_u64_add(dothits, 1);
1799	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1800	if (tsp != NULL)
1801		timespecclear(tsp);
1802	if (ticksp != NULL)
1803		*ticksp = ticks;
1804	vrefact(*vpp);
1805	/*
	 * When we look up "." we can still be asked to lock it
	 * with a different lock type than it currently holds.
1808	 */
1809	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1810	if (ltype != VOP_ISLOCKED(*vpp)) {
1811		if (ltype == LK_EXCLUSIVE) {
1812			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1813			if (VN_IS_DOOMED((*vpp))) {
1814				/* forced unmount */
1815				vrele(*vpp);
1816				*vpp = NULL;
1817				return (ENOENT);
1818			}
1819		} else
1820			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1821	}
1822	return (-1);
1823}
1824
1825static int __noinline
1826cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1827    struct timespec *tsp, int *ticksp)
1828{
1829	struct namecache_ts *ncp_ts;
1830	struct namecache *ncp;
1831	struct mtx *dvlp;
1832	enum vgetstate vs;
1833	int error, ltype;
1834	bool whiteout;
1835
1836	MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1837
1838	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1839		cache_remove_cnp(dvp, cnp);
1840		return (0);
1841	}
1842
1843	counter_u64_add(dotdothits, 1);
1844retry:
1845	dvlp = VP2VNODELOCK(dvp);
1846	mtx_lock(dvlp);
1847	ncp = dvp->v_cache_dd;
1848	if (ncp == NULL) {
1849		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1850		mtx_unlock(dvlp);
1851		return (0);
1852	}
1853	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1854		if (ncp->nc_flag & NCF_NEGATIVE)
1855			*vpp = NULL;
1856		else
1857			*vpp = ncp->nc_vp;
1858	} else
1859		*vpp = ncp->nc_dvp;
1860	if (*vpp == NULL)
1861		goto negative_success;
1862	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1863	cache_out_ts(ncp, tsp, ticksp);
1864	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1865	    NCF_DTS && tsp != NULL) {
1866		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1867		*tsp = ncp_ts->nc_dotdottime;
1868	}
1869
1870	MPASS(dvp != *vpp);
1871	ltype = VOP_ISLOCKED(dvp);
1872	VOP_UNLOCK(dvp);
1873	vs = vget_prep(*vpp);
1874	mtx_unlock(dvlp);
1875	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1876	vn_lock(dvp, ltype | LK_RETRY);
1877	if (VN_IS_DOOMED(dvp)) {
1878		if (error == 0)
1879			vput(*vpp);
1880		*vpp = NULL;
1881		return (ENOENT);
1882	}
1883	if (error) {
1884		*vpp = NULL;
1885		goto retry;
1886	}
1887	return (-1);
1888negative_success:
1889	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1890		if (cnp->cn_flags & ISLASTCN) {
1891			counter_u64_add(numnegzaps, 1);
1892			cache_zap_negative_locked_vnode_kl(ncp, dvp);
1893			mtx_unlock(dvlp);
1894			cache_free(ncp);
1895			return (0);
1896		}
1897	}
1898
1899	whiteout = (ncp->nc_flag & NCF_WHITE);
1900	cache_out_ts(ncp, tsp, ticksp);
1901	if (cache_neg_hit_prep(ncp))
1902		cache_neg_promote(ncp);
1903	else
1904		cache_neg_hit_finish(ncp);
1905	mtx_unlock(dvlp);
1906	if (whiteout)
1907		cnp->cn_flags |= ISWHITEOUT;
1908	return (ENOENT);
1909}
1910
1911/**
 * Look up a name in the name cache
1913 *
1914 * # Arguments
1915 *
1916 * - dvp:	Parent directory in which to search.
1917 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1918 * - cnp:	Parameters of the name search.  The most interesting bits of
1919 *   		the cn_flags field have the following meanings:
1920 *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1921 *   			it up.
1922 *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1923 * - tsp:	Return storage for cache timestamp.  On a successful (positive
1924 *   		or negative) lookup, tsp will be filled with any timespec that
1925 *   		was stored when this cache entry was created.  However, it will
1926 *   		be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *   		(positive or negative) lookup, it will contain the ticks value
 *   		that was current when the cache entry was created, unless cnp
 *   		was ".".
1931 *
 * Either both tsp and ticksp have to be provided or neither of them.
1933 *
1934 * # Returns
1935 *
1936 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1937 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1938 *		to a forced unmount.  vpp will not be modified.  If the entry
1939 *		is a whiteout, then the ISWHITEOUT flag will be set in
1940 *		cnp->cn_flags.
1941 * - 0:		A cache miss.  vpp will not be modified.
1942 *
1943 * # Locking
1944 *
1945 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up ".", an extra ref is taken, but
 * the lock is not recursively acquired.
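 *
 * # Example
 *
 * vfs_cache_lookup() below is the canonical consumer.  A filesystem lookup
 * routine typically handles the return values roughly as follows (sketch):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)
 *		return (0);
 *	return (error);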
1948 */
1949static int __noinline
1950cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1951    struct timespec *tsp, int *ticksp)
1952{
1953	struct namecache *ncp;
1954	struct mtx *blp;
1955	uint32_t hash;
1956	enum vgetstate vs;
1957	int error;
1958	bool whiteout;
1959
1960	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1961	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1962
1963retry:
1964	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1965	blp = HASH2BUCKETLOCK(hash);
1966	mtx_lock(blp);
1967
1968	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1969		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1970		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1971			break;
1972	}
1973
1974	if (__predict_false(ncp == NULL)) {
1975		mtx_unlock(blp);
1976		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
1977		counter_u64_add(nummiss, 1);
1978		return (0);
1979	}
1980
1981	if (ncp->nc_flag & NCF_NEGATIVE)
1982		goto negative_success;
1983
1984	counter_u64_add(numposhits, 1);
1985	*vpp = ncp->nc_vp;
1986	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1987	cache_out_ts(ncp, tsp, ticksp);
1988	MPASS(dvp != *vpp);
1989	vs = vget_prep(*vpp);
1990	mtx_unlock(blp);
1991	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1992	if (error) {
1993		*vpp = NULL;
1994		goto retry;
1995	}
1996	return (-1);
1997negative_success:
1998	/*
1999	 * We don't get here with regular lookup apart from corner cases.
2000	 */
2001	if (__predict_true(cnp->cn_nameiop == CREATE)) {
2002		if (cnp->cn_flags & ISLASTCN) {
2003			counter_u64_add(numnegzaps, 1);
2004			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2005			if (__predict_false(error != 0)) {
2006				zap_bucket_fail2++;
2007				goto retry;
2008			}
2009			cache_free(ncp);
2010			return (0);
2011		}
2012	}
2013
2014	whiteout = (ncp->nc_flag & NCF_WHITE);
2015	cache_out_ts(ncp, tsp, ticksp);
2016	if (cache_neg_hit_prep(ncp))
2017		cache_neg_promote(ncp);
2018	else
2019		cache_neg_hit_finish(ncp);
2020	mtx_unlock(blp);
2021	if (whiteout)
2022		cnp->cn_flags |= ISWHITEOUT;
2023	return (ENOENT);
2024}
2025
2026int
2027cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2028    struct timespec *tsp, int *ticksp)
2029{
2030	struct namecache *ncp;
2031	uint32_t hash;
2032	enum vgetstate vs;
2033	int error;
2034	bool whiteout, neg_promote;
2035	u_short nc_flag;
2036
2037	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2038
2039#ifdef DEBUG_CACHE
2040	if (__predict_false(!doingcache)) {
2041		cnp->cn_flags &= ~MAKEENTRY;
2042		return (0);
2043	}
2044#endif
2045
2046	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2047		if (cnp->cn_namelen == 1)
2048			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2049		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2050			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2051	}
2052
2053	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2054
2055	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2056		cache_remove_cnp(dvp, cnp);
2057		return (0);
2058	}
2059
2060	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2061	vfs_smr_enter();
2062
2063	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2064		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2065		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2066			break;
2067	}
2068
2069	if (__predict_false(ncp == NULL)) {
2070		vfs_smr_exit();
2071		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2072		counter_u64_add(nummiss, 1);
2073		return (0);
2074	}
2075
2076	nc_flag = atomic_load_char(&ncp->nc_flag);
2077	if (nc_flag & NCF_NEGATIVE)
2078		goto negative_success;
2079
2080	counter_u64_add(numposhits, 1);
2081	*vpp = ncp->nc_vp;
2082	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2083	cache_out_ts(ncp, tsp, ticksp);
2084	MPASS(dvp != *vpp);
2085	if (!cache_ncp_canuse(ncp)) {
2086		vfs_smr_exit();
2087		*vpp = NULL;
2088		goto out_fallback;
2089	}
2090	vs = vget_prep_smr(*vpp);
2091	vfs_smr_exit();
2092	if (__predict_false(vs == VGET_NONE)) {
2093		*vpp = NULL;
2094		goto out_fallback;
2095	}
2096	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2097	if (error) {
2098		*vpp = NULL;
2099		goto out_fallback;
2100	}
2101	return (-1);
2102negative_success:
2103	if (cnp->cn_nameiop == CREATE) {
2104		if (cnp->cn_flags & ISLASTCN) {
2105			vfs_smr_exit();
2106			goto out_fallback;
2107		}
2108	}
2109
2110	cache_out_ts(ncp, tsp, ticksp);
2111	whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2112	neg_promote = cache_neg_hit_prep(ncp);
2113	if (!cache_ncp_canuse(ncp)) {
2114		cache_neg_hit_abort(ncp);
2115		vfs_smr_exit();
2116		goto out_fallback;
2117	}
2118	if (neg_promote) {
2119		vfs_smr_exit();
2120		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2121			goto out_fallback;
2122	} else {
2123		cache_neg_hit_finish(ncp);
2124		vfs_smr_exit();
2125	}
2126	if (whiteout)
2127		cnp->cn_flags |= ISWHITEOUT;
2128	return (ENOENT);
2129out_fallback:
2130	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2131}
2132
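/*
 * Locking state used by the insertion path (cache_enter_time() and
 * cache_enter_dotdot_prep()): filled in by cache_enter_lock() or
 * cache_enter_lock_dd() and released by cache_enter_unlock().
 */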
2133struct celockstate {
2134	struct mtx *vlp[3];
2135	struct mtx *blp[2];
2136};
2137CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2138CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2139
2140static inline void
2141cache_celockstate_init(struct celockstate *cel)
2142{
2143
2144	bzero(cel, sizeof(*cel));
2145}
2146
2147static void
2148cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2149    struct vnode *dvp)
2150{
2151	struct mtx *vlp1, *vlp2;
2152
2153	MPASS(cel->vlp[0] == NULL);
2154	MPASS(cel->vlp[1] == NULL);
2155	MPASS(cel->vlp[2] == NULL);
2156
2157	MPASS(vp != NULL || dvp != NULL);
2158
2159	vlp1 = VP2VNODELOCK(vp);
2160	vlp2 = VP2VNODELOCK(dvp);
2161	cache_sort_vnodes(&vlp1, &vlp2);
2162
2163	if (vlp1 != NULL) {
2164		mtx_lock(vlp1);
2165		cel->vlp[0] = vlp1;
2166	}
2167	mtx_lock(vlp2);
2168	cel->vlp[1] = vlp2;
2169}
2170
2171static void
2172cache_unlock_vnodes_cel(struct celockstate *cel)
2173{
2174
2175	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2176
2177	if (cel->vlp[0] != NULL)
2178		mtx_unlock(cel->vlp[0]);
2179	if (cel->vlp[1] != NULL)
2180		mtx_unlock(cel->vlp[1]);
2181	if (cel->vlp[2] != NULL)
2182		mtx_unlock(cel->vlp[2]);
2183}
2184
2185static bool
2186cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2187{
2188	struct mtx *vlp;
2189	bool ret;
2190
2191	cache_assert_vlp_locked(cel->vlp[0]);
2192	cache_assert_vlp_locked(cel->vlp[1]);
2193	MPASS(cel->vlp[2] == NULL);
2194
2195	MPASS(vp != NULL);
2196	vlp = VP2VNODELOCK(vp);
2197
2198	ret = true;
2199	if (vlp >= cel->vlp[1]) {
2200		mtx_lock(vlp);
2201	} else {
2202		if (mtx_trylock(vlp))
2203			goto out;
2204		cache_lock_vnodes_cel_3_failures++;
2205		cache_unlock_vnodes_cel(cel);
2206		if (vlp < cel->vlp[0]) {
2207			mtx_lock(vlp);
2208			mtx_lock(cel->vlp[0]);
2209			mtx_lock(cel->vlp[1]);
2210		} else {
2211			if (cel->vlp[0] != NULL)
2212				mtx_lock(cel->vlp[0]);
2213			mtx_lock(vlp);
2214			mtx_lock(cel->vlp[1]);
2215		}
2216		ret = false;
2217	}
2218out:
2219	cel->vlp[2] = vlp;
2220	return (ret);
2221}
2222
2223static void
2224cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2225    struct mtx *blp2)
2226{
2227
2228	MPASS(cel->blp[0] == NULL);
2229	MPASS(cel->blp[1] == NULL);
2230
2231	cache_sort_vnodes(&blp1, &blp2);
2232
2233	if (blp1 != NULL) {
2234		mtx_lock(blp1);
2235		cel->blp[0] = blp1;
2236	}
2237	mtx_lock(blp2);
2238	cel->blp[1] = blp2;
2239}
2240
2241static void
2242cache_unlock_buckets_cel(struct celockstate *cel)
2243{
2244
2245	if (cel->blp[0] != NULL)
2246		mtx_unlock(cel->blp[0]);
2247	mtx_unlock(cel->blp[1]);
2248}
2249
2250/*
2251 * Lock part of the cache affected by the insertion.
2252 *
2253 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2254 * However, insertion can result in removal of an old entry. In this
2255 * case we have an additional vnode and bucketlock pair to lock.
2256 *
2257 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2258 * preserving the locking order (smaller address first).
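 *
 * For illustration, the worst case is entering a name for a directory vp
 * which already has a positive ".." entry: we need the vnode locks for dvp,
 * vp and the old parent that entry points to, plus the bucket locks for both
 * the new entry and the ".." entry about to be replaced.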
2259 */
2260static void
2261cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2262    uint32_t hash)
2263{
2264	struct namecache *ncp;
2265	struct mtx *blps[2];
2266	u_char nc_flag;
2267
2268	blps[0] = HASH2BUCKETLOCK(hash);
2269	for (;;) {
2270		blps[1] = NULL;
2271		cache_lock_vnodes_cel(cel, dvp, vp);
2272		if (vp == NULL || vp->v_type != VDIR)
2273			break;
2274		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2275		if (ncp == NULL)
2276			break;
2277		nc_flag = atomic_load_char(&ncp->nc_flag);
2278		if ((nc_flag & NCF_ISDOTDOT) == 0)
2279			break;
2280		MPASS(ncp->nc_dvp == vp);
2281		blps[1] = NCP2BUCKETLOCK(ncp);
2282		if ((nc_flag & NCF_NEGATIVE) != 0)
2283			break;
2284		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2285			break;
2286		/*
2287		 * All vnodes got re-locked. Re-validate the state and if
2288		 * nothing changed we are done. Otherwise restart.
2289		 */
2290		if (ncp == vp->v_cache_dd &&
2291		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2292		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2293		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2294			break;
2295		cache_unlock_vnodes_cel(cel);
2296		cel->vlp[0] = NULL;
2297		cel->vlp[1] = NULL;
2298		cel->vlp[2] = NULL;
2299	}
2300	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2301}
2302
2303static void
2304cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2305    uint32_t hash)
2306{
2307	struct namecache *ncp;
2308	struct mtx *blps[2];
2309	u_char nc_flag;
2310
2311	blps[0] = HASH2BUCKETLOCK(hash);
2312	for (;;) {
2313		blps[1] = NULL;
2314		cache_lock_vnodes_cel(cel, dvp, vp);
2315		ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2316		if (ncp == NULL)
2317			break;
2318		nc_flag = atomic_load_char(&ncp->nc_flag);
2319		if ((nc_flag & NCF_ISDOTDOT) == 0)
2320			break;
2321		MPASS(ncp->nc_dvp == dvp);
2322		blps[1] = NCP2BUCKETLOCK(ncp);
2323		if ((nc_flag & NCF_NEGATIVE) != 0)
2324			break;
2325		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2326			break;
2327		if (ncp == dvp->v_cache_dd &&
2328		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2329		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2330		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2331			break;
2332		cache_unlock_vnodes_cel(cel);
2333		cel->vlp[0] = NULL;
2334		cel->vlp[1] = NULL;
2335		cel->vlp[2] = NULL;
2336	}
2337	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2338}
2339
2340static void
2341cache_enter_unlock(struct celockstate *cel)
2342{
2343
2344	cache_unlock_buckets_cel(cel);
2345	cache_unlock_vnodes_cel(cel);
2346}
2347
2348static void __noinline
2349cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2350    struct componentname *cnp)
2351{
2352	struct celockstate cel;
2353	struct namecache *ncp;
2354	uint32_t hash;
2355	int len;
2356
2357	if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2358		return;
2359	len = cnp->cn_namelen;
2360	cache_celockstate_init(&cel);
2361	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2362	cache_enter_lock_dd(&cel, dvp, vp, hash);
2363	ncp = dvp->v_cache_dd;
2364	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2365		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2366		cache_zap_locked(ncp);
2367	} else {
2368		ncp = NULL;
2369	}
2370	atomic_store_ptr(&dvp->v_cache_dd, NULL);
2371	cache_enter_unlock(&cel);
2372	if (ncp != NULL)
2373		cache_free(ncp);
2374}
2375
2376/*
2377 * Add an entry to the cache.
2378 */
2379void
2380cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2381    struct timespec *tsp, struct timespec *dtsp)
2382{
2383	struct celockstate cel;
2384	struct namecache *ncp, *n2, *ndd;
2385	struct namecache_ts *ncp_ts;
2386	struct nchashhead *ncpp;
2387	uint32_t hash;
2388	int flag;
2389	int len;
2390
2391	KASSERT(cnp->cn_namelen <= NAME_MAX,
2392	    ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2393	    NAME_MAX));
2394#ifdef notyet
2395	/*
2396	 * Not everything doing this is weeded out yet.
2397	 */
2398	VNPASS(dvp != vp, dvp);
2399#endif
2400	VNPASS(!VN_IS_DOOMED(dvp), dvp);
2401	VNPASS(dvp->v_type != VNON, dvp);
2402	if (vp != NULL) {
2403		VNPASS(!VN_IS_DOOMED(vp), vp);
2404		VNPASS(vp->v_type != VNON, vp);
2405	}
2406
2407#ifdef DEBUG_CACHE
2408	if (__predict_false(!doingcache))
2409		return;
2410#endif
2411
2412	flag = 0;
2413	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2414		if (cnp->cn_namelen == 1)
2415			return;
2416		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2417			cache_enter_dotdot_prep(dvp, vp, cnp);
2418			flag = NCF_ISDOTDOT;
2419		}
2420	}
2421
2422	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2423	if (ncp == NULL)
2424		return;
2425
2426	cache_celockstate_init(&cel);
2427	ndd = NULL;
2428	ncp_ts = NULL;
2429
2430	/*
	 * Calculate the hash key and set up as much of the new
	 * namecache entry as possible before acquiring the lock.
2433	 */
2434	ncp->nc_flag = flag | NCF_WIP;
2435	ncp->nc_vp = vp;
2436	if (vp == NULL)
2437		cache_neg_init(ncp);
2438	ncp->nc_dvp = dvp;
2439	if (tsp != NULL) {
2440		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2441		ncp_ts->nc_time = *tsp;
2442		ncp_ts->nc_ticks = ticks;
2443		ncp_ts->nc_nc.nc_flag |= NCF_TS;
2444		if (dtsp != NULL) {
2445			ncp_ts->nc_dotdottime = *dtsp;
2446			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2447		}
2448	}
2449	len = ncp->nc_nlen = cnp->cn_namelen;
2450	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2451	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2452	ncp->nc_name[len] = '\0';
2453	cache_enter_lock(&cel, dvp, vp, hash);
2454
2455	/*
2456	 * See if this vnode or negative entry is already in the cache
2457	 * with this name.  This can happen with concurrent lookups of
2458	 * the same path name.
2459	 */
2460	ncpp = NCHHASH(hash);
2461	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2462		if (n2->nc_dvp == dvp &&
2463		    n2->nc_nlen == cnp->cn_namelen &&
2464		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2465			MPASS(cache_ncp_canuse(n2));
2466			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2467				KASSERT(vp == NULL,
2468				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2469				    __func__, NULL, vp, cnp->cn_nameptr));
2470			else
2471				KASSERT(n2->nc_vp == vp,
2472				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2473				    __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2474			/*
2475			 * Entries are supposed to be immutable unless in the
			 * process of getting destroyed. Accommodating changing
			 * timestamps is possible but not worth it.
2478			 * This should be harmless in terms of correctness, in
2479			 * the worst case resulting in an earlier expiration.
2480			 * Alternatively, the found entry can be replaced
2481			 * altogether.
2482			 */
2483			MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2484#if 0
2485			if (tsp != NULL) {
2486				KASSERT((n2->nc_flag & NCF_TS) != 0,
2487				    ("no NCF_TS"));
2488				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2489				n2_ts->nc_time = ncp_ts->nc_time;
2490				n2_ts->nc_ticks = ncp_ts->nc_ticks;
2491				if (dtsp != NULL) {
2492					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2493					n2_ts->nc_nc.nc_flag |= NCF_DTS;
2494				}
2495			}
2496#endif
2497			SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2498			    vp);
2499			goto out_unlock_free;
2500		}
2501	}
2502
2503	if (flag == NCF_ISDOTDOT) {
2504		/*
		 * See if we are trying to add a ".." entry, but some other
		 * lookup has already populated the v_cache_dd pointer.
2507		 */
2508		if (dvp->v_cache_dd != NULL)
2509			goto out_unlock_free;
2510		KASSERT(vp == NULL || vp->v_type == VDIR,
2511		    ("wrong vnode type %p", vp));
2512		atomic_thread_fence_rel();
2513		atomic_store_ptr(&dvp->v_cache_dd, ncp);
2514	}
2515
2516	if (vp != NULL) {
2517		if (flag != NCF_ISDOTDOT) {
2518			/*
			 * For this case, the cache entry maps both the
			 * directory's name and the name ".." for the
			 * directory's parent.
2522			 */
2523			if ((ndd = vp->v_cache_dd) != NULL) {
2524				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2525					cache_zap_locked(ndd);
2526				else
2527					ndd = NULL;
2528			}
2529			atomic_thread_fence_rel();
2530			atomic_store_ptr(&vp->v_cache_dd, ncp);
2531		} else if (vp->v_type != VDIR) {
2532			if (vp->v_cache_dd != NULL) {
2533				atomic_store_ptr(&vp->v_cache_dd, NULL);
2534			}
2535		}
2536	}
2537
2538	if (flag != NCF_ISDOTDOT) {
2539		if (LIST_EMPTY(&dvp->v_cache_src)) {
2540			cache_hold_vnode(dvp);
2541		}
2542		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2543	}
2544
2545	/*
2546	 * If the entry is "negative", we place it into the
2547	 * "negative" cache queue, otherwise, we place it into the
2548	 * destination vnode's cache entries queue.
2549	 */
2550	if (vp != NULL) {
2551		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2552		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2553		    vp);
2554	} else {
2555		if (cnp->cn_flags & ISWHITEOUT)
2556			atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2557		cache_neg_insert(ncp);
2558		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2559		    ncp->nc_name);
2560	}
2561
2562	/*
2563	 * Insert the new namecache entry into the appropriate chain
2564	 * within the cache entries table.
2565	 */
2566	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2567
2568	atomic_thread_fence_rel();
2569	/*
2570	 * Mark the entry as fully constructed.
2571	 * It is immutable past this point until its removal.
2572	 */
2573	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2574
2575	cache_enter_unlock(&cel);
2576	if (ndd != NULL)
2577		cache_free(ndd);
2578	return;
2579out_unlock_free:
2580	cache_enter_unlock(&cel);
2581	cache_free(ncp);
2582	return;
2583}
2584
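/*
 * Return the smallest power of 2 strictly greater than val.
 */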
2585static u_int
2586cache_roundup_2(u_int val)
2587{
2588	u_int res;
2589
2590	for (res = 1; res <= val; res <<= 1)
2591		continue;
2592
2593	return (res);
2594}
2595
2596static struct nchashhead *
2597nchinittbl(u_long elements, u_long *hashmask)
2598{
2599	struct nchashhead *hashtbl;
2600	u_long hashsize, i;
2601
2602	hashsize = cache_roundup_2(elements) / 2;
2603
2604	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2605	for (i = 0; i < hashsize; i++)
2606		CK_SLIST_INIT(&hashtbl[i]);
2607	*hashmask = hashsize - 1;
2608	return (hashtbl);
2609}
2610
2611static void
2612ncfreetbl(struct nchashhead *hashtbl)
2613{
2614
2615	free(hashtbl, M_VFSCACHE);
2616}
2617
2618/*
2619 * Name cache initialization, from vfs_init() when we are booting
2620 */
2621static void
2622nchinit(void *dummy __unused)
2623{
2624	u_int i;
2625
2626	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2627	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2628	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2629	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2630	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2631	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2632	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2633	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2634
2635	VFS_SMR_ZONE_SET(cache_zone_small);
2636	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2637	VFS_SMR_ZONE_SET(cache_zone_large);
2638	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2639
2640	ncsize = desiredvnodes * ncsizefactor;
2641	cache_recalc_neg_min(ncnegminpct);
2642	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2643	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2644	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2645		ncbuckethash = 7;
2646	if (ncbuckethash > nchash)
2647		ncbuckethash = nchash;
2648	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2649	    M_WAITOK | M_ZERO);
2650	for (i = 0; i < numbucketlocks; i++)
2651		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2652	ncvnodehash = ncbuckethash;
2653	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2654	    M_WAITOK | M_ZERO);
2655	for (i = 0; i < numvnodelocks; i++)
2656		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2657
2658	for (i = 0; i < numneglists; i++) {
2659		mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2660		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2661		TAILQ_INIT(&neglists[i].nl_list);
2662		TAILQ_INIT(&neglists[i].nl_hotlist);
2663	}
2664}
2665SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2666
2667void
2668cache_vnode_init(struct vnode *vp)
2669{
2670
2671	LIST_INIT(&vp->v_cache_src);
2672	TAILQ_INIT(&vp->v_cache_dst);
2673	vp->v_cache_dd = NULL;
2674	cache_prehash(vp);
2675}
2676
2677/*
2678 * Induce transient cache misses for lockless operation in cache_lookup() by
2679 * using a temporary hash table.
2680 *
2681 * This will force a fs lookup.
2682 *
 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize() after each
 * one to wait until all CPUs which may still be using the old state have
 * finished their lookups.
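 *
 * A minimal sketch of the lockless reader this pairs with (see the SMR
 * section of cache_lookup() above):
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	vfs_smr_enter();
 *	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash)
 *		...
 *	vfs_smr_exit();
 *
 * Hash chain selection (NCHHASH()) consults both nchash and nchashtbl; the
 * 2-step protocol ensures the mask a reader observes never exceeds the size
 * of the table it observes, so indexing stays in bounds.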
2685 */
2686static void
2687cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2688{
2689
2690	MPASS(temphash < nchash);
2691	/*
2692	 * Change the size. The new size is smaller and can safely be used
2693	 * against the existing table. All lookups which now hash wrong will
2694	 * result in a cache miss, which all callers are supposed to know how
2695	 * to handle.
2696	 */
2697	atomic_store_long(&nchash, temphash);
2698	atomic_thread_fence_rel();
2699	vfs_smr_synchronize();
2700	/*
2701	 * At this point everyone sees the updated hash value, but they still
2702	 * see the old table.
2703	 */
2704	atomic_store_ptr(&nchashtbl, temptbl);
2705	atomic_thread_fence_rel();
2706	vfs_smr_synchronize();
2707	/*
2708	 * At this point everyone sees the updated table pointer and size pair.
2709	 */
2710}
2711
2712/*
2713 * Set the new hash table.
2714 *
2715 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2716 * lockless operation in cache_lookup().
2717 */
2718static void
2719cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2720{
2721
2722	MPASS(nchash < new_hash);
2723	/*
	 * Change the pointer first. This won't result in out-of-bounds access
	 * since the temporary table is guaranteed to be smaller.
2726	 */
2727	atomic_store_ptr(&nchashtbl, new_tbl);
2728	atomic_thread_fence_rel();
2729	vfs_smr_synchronize();
2730	/*
2731	 * At this point everyone sees the updated pointer value, but they
2732	 * still see the old size.
2733	 */
2734	atomic_store_long(&nchash, new_hash);
2735	atomic_thread_fence_rel();
2736	vfs_smr_synchronize();
2737	/*
2738	 * At this point everyone sees the updated table pointer and size pair.
2739	 */
2740}
2741
2742void
2743cache_changesize(u_long newmaxvnodes)
2744{
2745	struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2746	u_long new_nchash, old_nchash, temphash;
2747	struct namecache *ncp;
2748	uint32_t hash;
2749	u_long newncsize;
2750	int i;
2751
2752	newncsize = newmaxvnodes * ncsizefactor;
2753	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2754	if (newmaxvnodes < numbucketlocks)
2755		newmaxvnodes = numbucketlocks;
2756
2757	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2758	/* If same hash table size, nothing to do */
2759	if (nchash == new_nchash) {
2760		ncfreetbl(new_nchashtbl);
2761		return;
2762	}
2763
2764	temptbl = nchinittbl(1, &temphash);
2765
2766	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries can be removed from under us,
	 * since doing so requires taking them off the hash table, which
	 * is prevented by the locks held here.
2770	 */
2771	cache_lock_all_vnodes();
2772	cache_lock_all_buckets();
2773	old_nchashtbl = nchashtbl;
2774	old_nchash = nchash;
2775	cache_changesize_set_temp(temptbl, temphash);
2776	for (i = 0; i <= old_nchash; i++) {
2777		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2778			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2779			    ncp->nc_dvp);
2780			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2781			CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2782		}
2783	}
2784	ncsize = newncsize;
2785	cache_recalc_neg_min(ncnegminpct);
2786	cache_changesize_set_new(new_nchashtbl, new_nchash);
2787	cache_unlock_all_buckets();
2788	cache_unlock_all_vnodes();
2789	ncfreetbl(old_nchashtbl);
2790	ncfreetbl(temptbl);
2791}
2792
2793/*
2794 * Remove all entries from and to a particular vnode.
2795 */
2796static void
2797cache_purge_impl(struct vnode *vp)
2798{
2799	struct cache_freebatch batch;
2800	struct namecache *ncp;
2801	struct mtx *vlp, *vlp2;
2802
2803	TAILQ_INIT(&batch);
2804	vlp = VP2VNODELOCK(vp);
2805	vlp2 = NULL;
2806	mtx_lock(vlp);
2807retry:
2808	while (!LIST_EMPTY(&vp->v_cache_src)) {
2809		ncp = LIST_FIRST(&vp->v_cache_src);
2810		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2811			goto retry;
2812		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2813	}
2814	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2815		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2816		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2817			goto retry;
2818		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2819	}
2820	ncp = vp->v_cache_dd;
2821	if (ncp != NULL) {
2822		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2823		   ("lost dotdot link"));
2824		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2825			goto retry;
2826		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2827	}
2828	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2829	mtx_unlock(vlp);
2830	if (vlp2 != NULL)
2831		mtx_unlock(vlp2);
2832	cache_free_batch(&batch);
2833}
2834
2835/*
2836 * Opportunistic check to see if there is anything to do.
2837 */
2838static bool
2839cache_has_entries(struct vnode *vp)
2840{
2841
2842	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2843	    atomic_load_ptr(&vp->v_cache_dd) == NULL)
2844		return (false);
2845	return (true);
2846}
2847
2848void
2849cache_purge(struct vnode *vp)
2850{
2851
2852	SDT_PROBE1(vfs, namecache, purge, done, vp);
2853	if (!cache_has_entries(vp))
2854		return;
2855	cache_purge_impl(vp);
2856}
2857
2858/*
2859 * Only to be used by vgone.
2860 */
2861void
2862cache_purge_vgone(struct vnode *vp)
2863{
2864	struct mtx *vlp;
2865
2866	VNPASS(VN_IS_DOOMED(vp), vp);
2867	if (cache_has_entries(vp)) {
2868		cache_purge_impl(vp);
2869		return;
2870	}
2871
2872	/*
2873	 * Serialize against a potential thread doing cache_purge.
2874	 */
2875	vlp = VP2VNODELOCK(vp);
2876	mtx_wait_unlocked(vlp);
2877	if (cache_has_entries(vp)) {
2878		cache_purge_impl(vp);
2879		return;
2880	}
2881	return;
2882}
2883
2884/*
2885 * Remove all negative entries for a particular directory vnode.
2886 */
2887void
2888cache_purge_negative(struct vnode *vp)
2889{
2890	struct cache_freebatch batch;
2891	struct namecache *ncp, *nnp;
2892	struct mtx *vlp;
2893
2894	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2895	if (LIST_EMPTY(&vp->v_cache_src))
2896		return;
2897	TAILQ_INIT(&batch);
2898	vlp = VP2VNODELOCK(vp);
2899	mtx_lock(vlp);
2900	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2901		if (!(ncp->nc_flag & NCF_NEGATIVE))
2902			continue;
2903		cache_zap_negative_locked_vnode_kl(ncp, vp);
2904		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2905	}
2906	mtx_unlock(vlp);
2907	cache_free_batch(&batch);
2908}
2909
2910/*
2911 * Entry points for modifying VOP operations.
2912 */
2913void
2914cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2915    struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2916{
2917
2918	ASSERT_VOP_IN_SEQC(fdvp);
2919	ASSERT_VOP_IN_SEQC(fvp);
2920	ASSERT_VOP_IN_SEQC(tdvp);
2921	if (tvp != NULL)
2922		ASSERT_VOP_IN_SEQC(tvp);
2923
2924	cache_purge(fvp);
2925	if (tvp != NULL) {
2926		cache_purge(tvp);
2927		KASSERT(!cache_remove_cnp(tdvp, tcnp),
2928		    ("%s: lingering negative entry", __func__));
2929	} else {
2930		cache_remove_cnp(tdvp, tcnp);
2931	}
2932
2933	/*
2934	 * TODO
2935	 *
	 * Historically renaming always purged all relevant entries,
	 * but that's quite wasteful. In particular, it turns out that in many
	 * cases the target file is accessed immediately after the rename,
	 * inducing a cache miss.
2940	 *
2941	 * Recode this to reduce relocking and reuse the existing entry (if any)
2942	 * instead of just removing it above and allocating a new one here.
2943	 */
2944	if (cache_rename_add) {
2945		cache_enter(tdvp, fvp, tcnp);
2946	}
2947}
2948
2949void
2950cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2951{
2952
2953	ASSERT_VOP_IN_SEQC(dvp);
2954	ASSERT_VOP_IN_SEQC(vp);
2955	cache_purge(vp);
2956}
2957
2958#ifdef INVARIANTS
2959/*
2960 * Validate that if an entry exists it matches.
2961 */
2962void
2963cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2964{
2965	struct namecache *ncp;
2966	struct mtx *blp;
2967	uint32_t hash;
2968
2969	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2970	if (CK_SLIST_EMPTY(NCHHASH(hash)))
2971		return;
2972	blp = HASH2BUCKETLOCK(hash);
2973	mtx_lock(blp);
2974	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2975		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2976		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2977			if (ncp->nc_vp != vp)
2978				panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
2979				    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
2980		}
2981	}
2982	mtx_unlock(blp);
2983}
2984#endif
2985
2986/*
2987 * Flush all entries referencing a particular filesystem.
2988 */
2989void
2990cache_purgevfs(struct mount *mp)
2991{
2992	struct vnode *vp, *mvp;
2993
2994	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2995	/*
2996	 * Somewhat wasteful iteration over all vnodes. Would be better to
2997	 * support filtering and avoid the interlock to begin with.
2998	 */
2999	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3000		if (!cache_has_entries(vp)) {
3001			VI_UNLOCK(vp);
3002			continue;
3003		}
3004		vholdl(vp);
3005		VI_UNLOCK(vp);
3006		cache_purge(vp);
3007		vdrop(vp);
3008	}
3009}
3010
3011/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through vop_cachedlookup only if needed.
3014 */
3015
3016int
3017vfs_cache_lookup(struct vop_lookup_args *ap)
3018{
3019	struct vnode *dvp;
3020	int error;
3021	struct vnode **vpp = ap->a_vpp;
3022	struct componentname *cnp = ap->a_cnp;
3023	int flags = cnp->cn_flags;
3024
3025	*vpp = NULL;
3026	dvp = ap->a_dvp;
3027
3028	if (dvp->v_type != VDIR)
3029		return (ENOTDIR);
3030
3031	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3032	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3033		return (EROFS);
3034
3035	error = vn_dir_check_exec(dvp, cnp);
3036	if (error != 0)
3037		return (error);
3038
3039	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3040	if (error == 0)
3041		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3042	if (error == -1)
3043		return (0);
3044	return (error);
3045}
3046
3047/* Implementation of the getcwd syscall. */
3048int
3049sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3050{
3051	char *buf, *retbuf;
3052	size_t buflen;
3053	int error;
3054
3055	buflen = uap->buflen;
3056	if (__predict_false(buflen < 2))
3057		return (EINVAL);
3058	if (buflen > MAXPATHLEN)
3059		buflen = MAXPATHLEN;
3060
3061	buf = uma_zalloc(namei_zone, M_WAITOK);
3062	error = vn_getcwd(buf, &retbuf, &buflen);
3063	if (error == 0)
3064		error = copyout(retbuf, uap->buf, buflen);
3065	uma_zfree(namei_zone, buf);
3066	return (error);
3067}
3068
3069int
3070vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3071{
3072	struct pwd *pwd;
3073	int error;
3074
3075	vfs_smr_enter();
3076	pwd = pwd_get_smr();
3077	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3078	    buflen, 0);
3079	VFS_SMR_ASSERT_NOT_ENTERED();
3080	if (error < 0) {
3081		pwd = pwd_hold(curthread);
3082		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3083		    retbuf, buflen);
3084		pwd_drop(pwd);
3085	}
3086
3087#ifdef KTRACE
3088	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3089		ktrnamei(*retbuf);
3090#endif
3091	return (error);
3092}
3093
3094static int
3095kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3096    size_t size, int flags, enum uio_seg pathseg)
3097{
3098	struct nameidata nd;
3099	char *retbuf, *freebuf;
3100	int error;
3101
3102	if (flags != 0)
3103		return (EINVAL);
3104	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
3105	    pathseg, path, fd, &cap_fstat_rights, td);
3106	if ((error = namei(&nd)) != 0)
3107		return (error);
3108	error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
3109	if (error == 0) {
3110		error = copyout(retbuf, buf, size);
3111		free(freebuf, M_TEMP);
3112	}
3113	NDFREE(&nd, 0);
3114	return (error);
3115}
3116
3117int
3118sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3119{
3120
3121	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3122	    uap->flags, UIO_USERSPACE));
3123}
3124
3125/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
3128 */
3129int
3130vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3131{
3132	struct pwd *pwd;
3133	char *buf;
3134	size_t buflen;
3135	int error;
3136
3137	if (__predict_false(vp == NULL))
3138		return (EINVAL);
3139
3140	buflen = MAXPATHLEN;
3141	buf = malloc(buflen, M_TEMP, M_WAITOK);
3142	vfs_smr_enter();
3143	pwd = pwd_get_smr();
3144	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3145	VFS_SMR_ASSERT_NOT_ENTERED();
3146	if (error < 0) {
3147		pwd = pwd_hold(curthread);
3148		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3149		pwd_drop(pwd);
3150	}
3151	if (error == 0)
3152		*freebuf = buf;
3153	else
3154		free(buf, M_TEMP);
3155	return (error);
3156}
3157
3158/*
 * This function is similar to vn_fullpath, but it attempts to look up the
3160 * pathname relative to the global root mount point.  This is required for the
3161 * auditing sub-system, as audited pathnames must be absolute, relative to the
3162 * global root mount point.
3163 */
3164int
3165vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3166{
3167	char *buf;
3168	size_t buflen;
3169	int error;
3170
3171	if (__predict_false(vp == NULL))
3172		return (EINVAL);
3173	buflen = MAXPATHLEN;
3174	buf = malloc(buflen, M_TEMP, M_WAITOK);
3175	vfs_smr_enter();
3176	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3177	VFS_SMR_ASSERT_NOT_ENTERED();
3178	if (error < 0) {
3179		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3180	}
3181	if (error == 0)
3182		*freebuf = buf;
3183	else
3184		free(buf, M_TEMP);
3185	return (error);
3186}
3187
3188static struct namecache *
3189vn_dd_from_dst(struct vnode *vp)
3190{
3191	struct namecache *ncp;
3192
3193	cache_assert_vnode_locked(vp);
3194	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3195		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3196			return (ncp);
3197	}
3198	return (NULL);
3199}
3200
3201int
3202vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3203{
3204	struct vnode *dvp;
3205	struct namecache *ncp;
3206	struct mtx *vlp;
3207	int error;
3208
3209	vlp = VP2VNODELOCK(*vp);
3210	mtx_lock(vlp);
3211	ncp = (*vp)->v_cache_dd;
3212	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3213		KASSERT(ncp == vn_dd_from_dst(*vp),
3214		    ("%s: mismatch for dd entry (%p != %p)", __func__,
3215		    ncp, vn_dd_from_dst(*vp)));
3216	} else {
3217		ncp = vn_dd_from_dst(*vp);
3218	}
3219	if (ncp != NULL) {
3220		if (*buflen < ncp->nc_nlen) {
3221			mtx_unlock(vlp);
3222			vrele(*vp);
3223			counter_u64_add(numfullpathfail4, 1);
3224			error = ENOMEM;
3225			SDT_PROBE3(vfs, namecache, fullpath, return, error,
3226			    vp, NULL);
3227			return (error);
3228		}
3229		*buflen -= ncp->nc_nlen;
3230		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3231		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3232		    ncp->nc_name, vp);
3233		dvp = *vp;
3234		*vp = ncp->nc_dvp;
3235		vref(*vp);
3236		mtx_unlock(vlp);
3237		vrele(dvp);
3238		return (0);
3239	}
3240	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3241
3242	mtx_unlock(vlp);
3243	vn_lock(*vp, LK_SHARED | LK_RETRY);
3244	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3245	vput(*vp);
3246	if (error) {
3247		counter_u64_add(numfullpathfail2, 1);
3248		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
3249		return (error);
3250	}
3251
3252	*vp = dvp;
3253	if (VN_IS_DOOMED(dvp)) {
3254		/* forced unmount */
3255		vrele(dvp);
3256		error = ENOENT;
3257		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3258		return (error);
3259	}
3260	/*
	 * *vp still has its use count incremented.
3262	 */
3263
3264	return (0);
3265}
3266
3267/*
3268 * Resolve a directory to a pathname.
3269 *
3270 * The name of the directory can always be found in the namecache or fetched
3271 * from the filesystem. There is also guaranteed to be only one parent, meaning
3272 * we can just follow vnodes up until we find the root.
3273 *
3274 * The vnode must be referenced.
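 *
 * The path is assembled backwards: buf is filled from its end; on success
 * *retbuf points into buf at the start of the assembled path and *len is
 * updated to the produced length plus any caller-supplied addend.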
3275 */
3276static int
3277vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3278    size_t *len, size_t addend)
3279{
3280#ifdef KDTRACE_HOOKS
3281	struct vnode *startvp = vp;
3282#endif
3283	struct vnode *vp1;
3284	size_t buflen;
3285	int error;
3286	bool slash_prefixed;
3287
3288	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3289	VNPASS(vp->v_usecount > 0, vp);
3290
3291	buflen = *len;
3292
3293	slash_prefixed = true;
3294	if (addend == 0) {
3295		MPASS(*len >= 2);
3296		buflen--;
3297		buf[buflen] = '\0';
3298		slash_prefixed = false;
3299	}
3300
3301	error = 0;
3302
3303	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3304	counter_u64_add(numfullpathcalls, 1);
3305	while (vp != rdir && vp != rootvnode) {
3306		/*
3307		 * The vp vnode must be already fully constructed,
3308		 * since it is either found in namecache or obtained
3309		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3310		 * without obtaining the vnode lock.
3311		 */
3312		if ((vp->v_vflag & VV_ROOT) != 0) {
3313			vn_lock(vp, LK_RETRY | LK_SHARED);
3314
3315			/*
3316			 * With the vnode locked, check for races with
3317			 * unmount, forced or not.  Note that we
3318			 * already verified that vp is not equal to
3319			 * the root vnode, which means that
3320			 * mnt_vnodecovered can be NULL only for the
3321			 * case of unmount.
3322			 */
3323			if (VN_IS_DOOMED(vp) ||
3324			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3325			    vp1->v_mountedhere != vp->v_mount) {
3326				vput(vp);
3327				error = ENOENT;
3328				SDT_PROBE3(vfs, namecache, fullpath, return,
3329				    error, vp, NULL);
3330				break;
3331			}
3332
3333			vref(vp1);
3334			vput(vp);
3335			vp = vp1;
3336			continue;
3337		}
3338		if (vp->v_type != VDIR) {
3339			vrele(vp);
3340			counter_u64_add(numfullpathfail1, 1);
3341			error = ENOTDIR;
3342			SDT_PROBE3(vfs, namecache, fullpath, return,
3343			    error, vp, NULL);
3344			break;
3345		}
3346		error = vn_vptocnp(&vp, buf, &buflen);
3347		if (error)
3348			break;
3349		if (buflen == 0) {
3350			vrele(vp);
3351			error = ENOMEM;
3352			SDT_PROBE3(vfs, namecache, fullpath, return, error,
3353			    startvp, NULL);
3354			break;
3355		}
3356		buf[--buflen] = '/';
3357		slash_prefixed = true;
3358	}
3359	if (error)
3360		return (error);
3361	if (!slash_prefixed) {
3362		if (buflen == 0) {
3363			vrele(vp);
3364			counter_u64_add(numfullpathfail4, 1);
3365			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3366			    startvp, NULL);
3367			return (ENOMEM);
3368		}
3369		buf[--buflen] = '/';
3370	}
3371	counter_u64_add(numfullpathfound, 1);
3372	vrele(vp);
3373
3374	*retbuf = buf + buflen;
3375	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3376	*len -= buflen;
3377	*len += addend;
3378	return (0);
3379}
3380
3381/*
3382 * Resolve an arbitrary vnode to a pathname.
3383 *
3384 * Note 2 caveats:
3385 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3386 *   resolve to a different path than the one used to find it
3387 * - namecache is not mandatory, meaning names are not guaranteed to be added
3388 *   (in which case resolving fails)
3389 */
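/*
 * Record the source line at which a lockless reverse lookup bailed out; the
 * value is reported by the fullpath_smr miss probe in vn_fullpath_any_smr().
 */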
3390static void __inline
3391cache_rev_failed_impl(int *reason, int line)
3392{
3393
3394	*reason = line;
3395}
3396#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3397
3398static int
3399vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3400    char **retbuf, size_t *buflen, size_t addend)
3401{
3402#ifdef KDTRACE_HOOKS
3403	struct vnode *startvp = vp;
3404#endif
3405	struct vnode *tvp;
3406	struct mount *mp;
3407	struct namecache *ncp;
3408	size_t orig_buflen;
3409	int reason;
3410	int error;
3411#ifdef KDTRACE_HOOKS
3412	int i;
3413#endif
3414	seqc_t vp_seqc, tvp_seqc;
3415	u_char nc_flag;
3416
3417	VFS_SMR_ASSERT_ENTERED();
3418
3419	if (!cache_fast_revlookup) {
3420		vfs_smr_exit();
3421		return (-1);
3422	}
3423
3424	orig_buflen = *buflen;
3425
3426	if (addend == 0) {
3427		MPASS(*buflen >= 2);
3428		*buflen -= 1;
3429		buf[*buflen] = '\0';
3430	}
3431
3432	if (vp == rdir || vp == rootvnode) {
3433		if (addend == 0) {
3434			*buflen -= 1;
3435			buf[*buflen] = '/';
3436		}
3437		goto out_ok;
3438	}
3439
3440#ifdef KDTRACE_HOOKS
3441	i = 0;
3442#endif
3443	error = -1;
3444	ncp = NULL; /* for sdt probe down below */
3445	vp_seqc = vn_seqc_read_any(vp);
3446	if (seqc_in_modify(vp_seqc)) {
3447		cache_rev_failed(&reason);
3448		goto out_abort;
3449	}
3450
3451	for (;;) {
3452#ifdef KDTRACE_HOOKS
3453		i++;
3454#endif
3455		if ((vp->v_vflag & VV_ROOT) != 0) {
3456			mp = atomic_load_ptr(&vp->v_mount);
3457			if (mp == NULL) {
3458				cache_rev_failed(&reason);
3459				goto out_abort;
3460			}
3461			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3462			tvp_seqc = vn_seqc_read_any(tvp);
3463			if (seqc_in_modify(tvp_seqc)) {
3464				cache_rev_failed(&reason);
3465				goto out_abort;
3466			}
3467			if (!vn_seqc_consistent(vp, vp_seqc)) {
3468				cache_rev_failed(&reason);
3469				goto out_abort;
3470			}
3471			vp = tvp;
3472			vp_seqc = tvp_seqc;
3473			continue;
3474		}
3475		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3476		if (ncp == NULL) {
3477			cache_rev_failed(&reason);
3478			goto out_abort;
3479		}
3480		nc_flag = atomic_load_char(&ncp->nc_flag);
3481		if ((nc_flag & NCF_ISDOTDOT) != 0) {
3482			cache_rev_failed(&reason);
3483			goto out_abort;
3484		}
3485		if (ncp->nc_nlen >= *buflen) {
3486			cache_rev_failed(&reason);
3487			error = ENOMEM;
3488			goto out_abort;
3489		}
3490		*buflen -= ncp->nc_nlen;
3491		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3492		*buflen -= 1;
3493		buf[*buflen] = '/';
3494		tvp = ncp->nc_dvp;
3495		tvp_seqc = vn_seqc_read_any(tvp);
3496		if (seqc_in_modify(tvp_seqc)) {
3497			cache_rev_failed(&reason);
3498			goto out_abort;
3499		}
3500		if (!vn_seqc_consistent(vp, vp_seqc)) {
3501			cache_rev_failed(&reason);
3502			goto out_abort;
3503		}
3504		/*
3505		 * Acquire fence provided by vn_seqc_read_any above.
3506		 */
3507		if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3508			cache_rev_failed(&reason);
3509			goto out_abort;
3510		}
3511		if (!cache_ncp_canuse(ncp)) {
3512			cache_rev_failed(&reason);
3513			goto out_abort;
3514		}
3515		vp = tvp;
3516		vp_seqc = tvp_seqc;
3517		if (vp == rdir || vp == rootvnode)
3518			break;
3519	}
3520out_ok:
3521	vfs_smr_exit();
3522	*retbuf = buf + *buflen;
3523	*buflen = orig_buflen - *buflen + addend;
3524	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3525	return (0);
3526
3527out_abort:
3528	*buflen = orig_buflen;
3529	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3530	vfs_smr_exit();
3531	return (error);
3532}
3533
3534static int
3535vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3536    size_t *buflen)
3537{
3538	size_t orig_buflen, addend;
3539	int error;
3540
3541	if (*buflen < 2)
3542		return (EINVAL);
3543
3544	orig_buflen = *buflen;
3545
3546	vref(vp);
3547	addend = 0;
3548	if (vp->v_type != VDIR) {
3549		*buflen -= 1;
3550		buf[*buflen] = '\0';
3551		error = vn_vptocnp(&vp, buf, buflen);
3552		if (error)
3553			return (error);
3554		if (*buflen == 0) {
3555			vrele(vp);
3556			return (ENOMEM);
3557		}
3558		*buflen -= 1;
3559		buf[*buflen] = '/';
3560		addend = orig_buflen - *buflen;
3561	}
3562
3563	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3564}
3565
3566/*
3567 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3568 *
3569 * Since the namecache does not track hardlinks, the caller is expected to first
3570 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3571 *
3572 * Then we have 2 cases:
3573 * - if the found vnode is a directory, the path can be constructed just by
3574 *   following names up the chain
3575 * - otherwise we populate the buffer with the saved name and start resolving
3576 *   from the parent
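 *
 * For example (hypothetical path), resolving a lookup of "/tmp/file" first
 * places "/file" at the end of the buffer and then walks up from the
 * directory vnode of "/tmp"; for a directory target the walk starts at the
 * vnode itself.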
3577 */
3578static int
3579vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3580    size_t *buflen)
3581{
3582	char *buf, *tmpbuf;
3583	struct pwd *pwd;
3584	struct componentname *cnp;
3585	struct vnode *vp;
3586	size_t addend;
3587	int error;
3588	enum vtype type;
3589
3590	if (*buflen < 2)
3591		return (EINVAL);
3592	if (*buflen > MAXPATHLEN)
3593		*buflen = MAXPATHLEN;
3594
3595	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3596
3597	addend = 0;
3598	vp = ndp->ni_vp;
3599	/*
3600	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3601	 *
	 * For example, consider tmpfs mounted on /tmp and realpath /tmp: ni_vp
	 * will be set to the mount point's root vnode while ni_dvp will be
	 * vp_crossmp.
3604	 * If the type is VDIR (like in this very case) we can skip looking
3605	 * at ni_dvp in the first place. However, since vnodes get passed here
3606	 * unlocked the target may transition to doomed state (type == VBAD)
3607	 * before we get to evaluate the condition. If this happens, we will
3608	 * populate part of the buffer and descend to vn_fullpath_dir with
3609	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3610	 *
3611	 * This should be atomic_load(&vp->v_type) but it is illegal to take
3612	 * an address of a bit field, even if said field is sized to char.
3613	 * Work around the problem by reading the value into a full-sized enum
3614	 * and then re-reading it with atomic_load which will still prevent
3615	 * the compiler from re-reading down the road.
3616	 */
3617	type = vp->v_type;
3618	type = atomic_load_int(&type);
3619	if (type == VBAD) {
3620		error = ENOENT;
3621		goto out_bad;
3622	}
3623	if (type != VDIR) {
3624		cnp = &ndp->ni_cnd;
3625		addend = cnp->cn_namelen + 2;
3626		if (*buflen < addend) {
3627			error = ENOMEM;
3628			goto out_bad;
3629		}
3630		*buflen -= addend;
3631		tmpbuf = buf + *buflen;
3632		tmpbuf[0] = '/';
3633		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3634		tmpbuf[addend - 1] = '\0';
3635		vp = ndp->ni_dvp;
3636	}
3637
3638	vfs_smr_enter();
3639	pwd = pwd_get_smr();
3640	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3641	    addend);
3642	VFS_SMR_ASSERT_NOT_ENTERED();
3643	if (error < 0) {
3644		pwd = pwd_hold(curthread);
3645		vref(vp);
3646		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3647		    addend);
3648		pwd_drop(pwd);
3649	}
3650	if (error != 0)
3651		goto out_bad;
3652
3653	*freebuf = buf;
3654
3655	return (0);
3656out_bad:
3657	free(buf, M_TEMP);
3658	return (error);
3659}
3660
3661struct vnode *
3662vn_dir_dd_ino(struct vnode *vp)
3663{
3664	struct namecache *ncp;
3665	struct vnode *ddvp;
3666	struct mtx *vlp;
3667	enum vgetstate vs;
3668
3669	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3670	vlp = VP2VNODELOCK(vp);
3671	mtx_lock(vlp);
3672	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3673		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3674			continue;
3675		ddvp = ncp->nc_dvp;
3676		vs = vget_prep(ddvp);
3677		mtx_unlock(vlp);
3678		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3679			return (NULL);
3680		return (ddvp);
3681	}
3682	mtx_unlock(vlp);
3683	return (NULL);
3684}
3685
3686int
3687vn_commname(struct vnode *vp, char *buf, u_int buflen)
3688{
3689	struct namecache *ncp;
3690	struct mtx *vlp;
3691	int l;
3692
3693	vlp = VP2VNODELOCK(vp);
3694	mtx_lock(vlp);
3695	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3696		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3697			break;
3698	if (ncp == NULL) {
3699		mtx_unlock(vlp);
3700		return (ENOENT);
3701	}
3702	l = min(ncp->nc_nlen, buflen - 1);
3703	memcpy(buf, ncp->nc_name, l);
3704	mtx_unlock(vlp);
3705	buf[l] = '\0';
3706	return (0);
3707}
3708
3709/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
3712 *
3713 * Requires a locked, referenced vnode.
3714 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3715 *
3716 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3717 * because it falls back to the ".." lookup if the namecache lookup fails.
3718 */
3719int
3720vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3721    u_int pathlen)
3722{
3723	struct nameidata nd;
3724	struct vnode *vp1;
3725	char *rpath, *fbuf;
3726	int error;
3727
3728	ASSERT_VOP_ELOCKED(vp, __func__);
3729
3730	/* Construct global filesystem path from vp. */
3731	VOP_UNLOCK(vp);
3732	error = vn_fullpath_global(vp, &rpath, &fbuf);
3733
3734	if (error != 0) {
3735		vrele(vp);
3736		return (error);
3737	}
3738
3739	if (strlen(rpath) >= pathlen) {
3740		vrele(vp);
3741		error = ENAMETOOLONG;
3742		goto out;
3743	}
3744
3745	/*
3746	 * Re-lookup the vnode by path to detect a possible rename.
3747	 * As a side effect, the vnode is relocked.
	 * If the vnode was renamed, return ENOENT.
3749	 */
3750	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3751	    UIO_SYSSPACE, path, td);
3752	error = namei(&nd);
3753	if (error != 0) {
3754		vrele(vp);
3755		goto out;
3756	}
3757	NDFREE(&nd, NDF_ONLY_PNBUF);
3758	vp1 = nd.ni_vp;
3759	vrele(vp);
3760	if (vp1 == vp)
3761		strcpy(path, rpath);
3762	else {
3763		vput(vp1);
3764		error = ENOENT;
3765	}
3766
3767out:
3768	free(fbuf, M_TEMP);
3769	return (error);
3770}
3771
3772#ifdef DDB
3773static void
3774db_print_vpath(struct vnode *vp)
3775{
3776
3777	while (vp != NULL) {
3778		db_printf("%p: ", vp);
3779		if (vp == rootvnode) {
3780			db_printf("/");
3781			vp = NULL;
3782		} else {
3783			if (vp->v_vflag & VV_ROOT) {
3784				db_printf("<mount point>");
3785				vp = vp->v_mount->mnt_vnodecovered;
3786			} else {
3787				struct namecache *ncp;
3788				char *ncn;
3789				int i;
3790
3791				ncp = TAILQ_FIRST(&vp->v_cache_dst);
3792				if (ncp != NULL) {
3793					ncn = ncp->nc_name;
3794					for (i = 0; i < ncp->nc_nlen; i++)
3795						db_printf("%c", *ncn++);
3796					vp = ncp->nc_dvp;
3797				} else {
3798					vp = NULL;
3799				}
3800			}
3801		}
3802		db_printf("\n");
3803	}
3804
3805	return;
3806}
3807
3808DB_SHOW_COMMAND(vpath, db_show_vpath)
3809{
3810	struct vnode *vp;
3811
3812	if (!have_addr) {
3813		db_printf("usage: show vpath <struct vnode *>\n");
3814		return;
3815	}
3816
3817	vp = (struct vnode *)addr;
3818	db_print_vpath(vp);
3819}
3820
3821#endif
3822
3823static int cache_fast_lookup = 1;
3824static char __read_frequently cache_fast_lookup_enabled = true;
3825
3826#define CACHE_FPL_FAILED	-2020
3827
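/*
 * Recompute whether lockless lookup can be used: it is only enabled when the
 * vfs.cache_fast_lookup knob is set and no MAC hooks for vnode lookup or
 * readlink are registered.
 */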
3828void
3829cache_fast_lookup_enabled_recalc(void)
3830{
3831	int lookup_flag;
3832	int mac_on;
3833
3834#ifdef MAC
3835	mac_on = mac_vnode_check_lookup_enabled();
3836	mac_on |= mac_vnode_check_readlink_enabled();
3837#else
3838	mac_on = 0;
3839#endif
3840
3841	lookup_flag = atomic_load_int(&cache_fast_lookup);
3842	if (lookup_flag && !mac_on) {
3843		atomic_store_char(&cache_fast_lookup_enabled, true);
3844	} else {
3845		atomic_store_char(&cache_fast_lookup_enabled, false);
3846	}
3847}
3848
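/*
 * Handle writes to the vfs.cache_fast_lookup sysctl and recalculate the
 * effective state when the value changes.
 */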
3849static int
3850sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
3851{
3852	int error, old;
3853
3854	old = atomic_load_int(&cache_fast_lookup);
3855	error = sysctl_handle_int(oidp, arg1, arg2, req);
3856	if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
3857		cache_fast_lookup_enabled_recalc();
3858	return (error);
3859}
3860SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
3861    &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");
3862
3863/*
3864 * Components of nameidata (or objects it can point to) which may
3865 * need restoring in case fast path lookup fails.
3866 */
3867struct nameidata_outer {
3868	size_t ni_pathlen;
3869	int cn_flags;
3870};
3871
3872struct nameidata_saved {
3873#ifdef INVARIANTS
3874	char *cn_nameptr;
3875	size_t ni_pathlen;
3876#endif
3877};
3878
3879#ifdef INVARIANTS
3880struct cache_fpl_debug {
3881	size_t ni_pathlen;
3882};
3883#endif
3884
3885struct cache_fpl {
3886	struct nameidata *ndp;
3887	struct componentname *cnp;
3888	char *nulchar;
3889	struct vnode *dvp;
3890	struct vnode *tvp;
3891	seqc_t dvp_seqc;
3892	seqc_t tvp_seqc;
3893	uint32_t hash;
3894	struct nameidata_saved snd;
3895	struct nameidata_outer snd_outer;
3896	int line;
3897	enum cache_fpl_status status:8;
3898	bool in_smr;
3899	bool fsearch;
3900	bool savename;
3901	struct pwd **pwd;
3902#ifdef INVARIANTS
3903	struct cache_fpl_debug debug;
3904#endif
3905};
3906
3907static bool cache_fplookup_mp_supported(struct mount *mp);
3908static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
3909static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
3910static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
3911static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
3912static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
3913static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
3914static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
3915static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
3916static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
3917
3918static void
3919cache_fpl_cleanup_cnp(struct componentname *cnp)
3920{
3921
3922	uma_zfree(namei_zone, cnp->cn_pnbuf);
3923#ifdef DIAGNOSTIC
3924	cnp->cn_pnbuf = NULL;
3925	cnp->cn_nameptr = NULL;
3926#endif
3927}
3928
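/*
 * The remaining path starts with a slash (e.g., an absolute symlink was just
 * resolved): skip over the leading slash(es) and return the root directory
 * to continue the lookup from.
 */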
3929static struct vnode *
3930cache_fpl_handle_root(struct cache_fpl *fpl)
3931{
3932	struct nameidata *ndp;
3933	struct componentname *cnp;
3934
3935	ndp = fpl->ndp;
3936	cnp = fpl->cnp;
3937
3938	MPASS(*(cnp->cn_nameptr) == '/');
3939	cnp->cn_nameptr++;
3940	cache_fpl_pathlen_dec(fpl);
3941
3942	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
3943		do {
3944			cnp->cn_nameptr++;
3945			cache_fpl_pathlen_dec(fpl);
3946		} while (*(cnp->cn_nameptr) == '/');
3947	}
3948
3949	return (ndp->ni_rootdir);
3950}
3951
3952static void
3953cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
3954{
3955
3956	fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
3957	fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
3958}
3959
3960static void
3961cache_fpl_checkpoint(struct cache_fpl *fpl)
3962{
3963
3964#ifdef INVARIANTS
3965	fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3966	fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
3967#endif
3968}
3969
3970static void
3971cache_fpl_restore_partial(struct cache_fpl *fpl)
3972{
3973
3974	fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
3975#ifdef INVARIANTS
3976	fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
3977#endif
3978}
3979
3980static void
3981cache_fpl_restore_abort(struct cache_fpl *fpl)
3982{
3983
3984	cache_fpl_restore_partial(fpl);
3985	/*
3986	 * It is 0 on entry by API contract.
3987	 */
3988	fpl->ndp->ni_resflags = 0;
3989	fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
3990	fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
3991}
3992
3993#ifdef INVARIANTS
3994#define cache_fpl_smr_assert_entered(fpl) ({			\
3995	struct cache_fpl *_fpl = (fpl);				\
3996	MPASS(_fpl->in_smr == true);				\
3997	VFS_SMR_ASSERT_ENTERED();				\
3998})
3999#define cache_fpl_smr_assert_not_entered(fpl) ({		\
4000	struct cache_fpl *_fpl = (fpl);				\
4001	MPASS(_fpl->in_smr == false);				\
4002	VFS_SMR_ASSERT_NOT_ENTERED();				\
4003})
4004static void
4005cache_fpl_assert_status(struct cache_fpl *fpl)
4006{
4007
4008	switch (fpl->status) {
4009	case CACHE_FPL_STATUS_UNSET:
4010		__assert_unreachable();
4011		break;
4012	case CACHE_FPL_STATUS_DESTROYED:
4013	case CACHE_FPL_STATUS_ABORTED:
4014	case CACHE_FPL_STATUS_PARTIAL:
4015	case CACHE_FPL_STATUS_HANDLED:
4016		break;
4017	}
4018}
4019#else
4020#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4021#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4022#define cache_fpl_assert_status(fpl) do { } while (0)
4023#endif
4024
4025#define cache_fpl_smr_enter_initial(fpl) ({			\
4026	struct cache_fpl *_fpl = (fpl);				\
4027	vfs_smr_enter();					\
4028	_fpl->in_smr = true;					\
4029})
4030
4031#define cache_fpl_smr_enter(fpl) ({				\
4032	struct cache_fpl *_fpl = (fpl);				\
4033	MPASS(_fpl->in_smr == false);				\
4034	vfs_smr_enter();					\
4035	_fpl->in_smr = true;					\
4036})
4037
4038#define cache_fpl_smr_exit(fpl) ({				\
4039	struct cache_fpl *_fpl = (fpl);				\
4040	MPASS(_fpl->in_smr == true);				\
4041	vfs_smr_exit();						\
4042	_fpl->in_smr = false;					\
4043})
4044
4045static int
4046cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4047{
4048
4049	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4050		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4051		    ("%s: converting to abort from %d at %d, set at %d\n",
4052		    __func__, fpl->status, line, fpl->line));
4053	}
4054	cache_fpl_smr_assert_not_entered(fpl);
4055	fpl->status = CACHE_FPL_STATUS_ABORTED;
4056	fpl->line = line;
4057	return (CACHE_FPL_FAILED);
4058}
4059
4060#define cache_fpl_aborted_early(x)	cache_fpl_aborted_early_impl((x), __LINE__)
4061
4062static int __noinline
4063cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4064{
4065	struct nameidata *ndp;
4066	struct componentname *cnp;
4067
4068	ndp = fpl->ndp;
4069	cnp = fpl->cnp;
4070
4071	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4072		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4073		    ("%s: converting to abort from %d at %d, set at %d\n",
4074		    __func__, fpl->status, line, fpl->line));
4075	}
4076	fpl->status = CACHE_FPL_STATUS_ABORTED;
4077	fpl->line = line;
4078	if (fpl->in_smr)
4079		cache_fpl_smr_exit(fpl);
4080	cache_fpl_restore_abort(fpl);
4081	/*
4082	 * Resolving symlinks overwrites data passed by the caller.
4083	 * Let namei know.
4084	 */
4085	if (ndp->ni_loopcnt > 0) {
4086		fpl->status = CACHE_FPL_STATUS_DESTROYED;
4087		cache_fpl_cleanup_cnp(cnp);
4088	}
4089	return (CACHE_FPL_FAILED);
4090}
4091
4092#define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
4093
4094static int __noinline
4095cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4096{
4097
4098	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4099	    ("%s: setting to partial at %d, but already set to %d at %d\n",
4100	    __func__, line, fpl->status, fpl->line));
4101	cache_fpl_smr_assert_entered(fpl);
4102	fpl->status = CACHE_FPL_STATUS_PARTIAL;
4103	fpl->line = line;
4104	return (cache_fplookup_partial_setup(fpl));
4105}
4106
4107#define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
4108
4109static int
4110cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4111{
4112
4113	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4114	    ("%s: setting to handled at %d, but already set to %d at %d\n",
4115	    __func__, line, fpl->status, fpl->line));
4116	cache_fpl_smr_assert_not_entered(fpl);
4117	fpl->status = CACHE_FPL_STATUS_HANDLED;
4118	fpl->line = line;
4119	return (0);
4120}
4121
4122#define cache_fpl_handled(x)	cache_fpl_handled_impl((x), __LINE__)
4123
4124static int
4125cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4126{
4127
4128	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4129	    ("%s: setting to handled at %d, but already set to %d at %d\n",
4130	    __func__, line, fpl->status, fpl->line));
4131	MPASS(error != 0);
4132	MPASS(error != CACHE_FPL_FAILED);
4133	cache_fpl_smr_assert_not_entered(fpl);
4134	fpl->status = CACHE_FPL_STATUS_HANDLED;
4135	fpl->line = line;
4136	fpl->dvp = NULL;
4137	fpl->tvp = NULL;
4138	fpl->savename = false;
4139	return (error);
4140}
4141
4142#define cache_fpl_handled_error(x, e)	cache_fpl_handled_error_impl((x), (e), __LINE__)
4143
4144static bool
4145cache_fpl_terminated(struct cache_fpl *fpl)
4146{
4147
4148	return (fpl->status != CACHE_FPL_STATUS_UNSET);
4149}
4150
4151#define CACHE_FPL_SUPPORTED_CN_FLAGS \
4152	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4153	 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \
4154	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
4155
4156#define CACHE_FPL_INTERNAL_CN_FLAGS \
4157	(ISDOTDOT | MAKEENTRY | ISLASTCN)
4158
4159_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4160    "supported and internal flags overlap");
4161
4162static bool
4163cache_fpl_islastcn(struct nameidata *ndp)
4164{
4165
4166	return (*ndp->ni_next == 0);
4167}
4168
4169static bool
4170cache_fpl_istrailingslash(struct cache_fpl *fpl)
4171{
4172
4173	return (*(fpl->nulchar - 1) == '/');
4174}
4175
4176static bool
4177cache_fpl_isdotdot(struct componentname *cnp)
4178{
4179
4180	if (cnp->cn_namelen == 2 &&
4181	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4182		return (true);
4183	return (false);
4184}
4185
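/*
 * Decide whether lockless lookup can be attempted for this call. Bail out if
 * the feature is disabled, unsupported flags were passed, the thread is in
 * capability mode or being audited, or a starting directory was preset.
 */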
4186static bool
4187cache_can_fplookup(struct cache_fpl *fpl)
4188{
4189	struct nameidata *ndp;
4190	struct componentname *cnp;
4191	struct thread *td;
4192
4193	ndp = fpl->ndp;
4194	cnp = fpl->cnp;
4195	td = cnp->cn_thread;
4196
4197	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4198		cache_fpl_aborted_early(fpl);
4199		return (false);
4200	}
4201	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4202		cache_fpl_aborted_early(fpl);
4203		return (false);
4204	}
4205	if (IN_CAPABILITY_MODE(td)) {
4206		cache_fpl_aborted_early(fpl);
4207		return (false);
4208	}
4209	if (AUDITING_TD(td)) {
4210		cache_fpl_aborted_early(fpl);
4211		return (false);
4212	}
4213	if (ndp->ni_startdir != NULL) {
4214		cache_fpl_aborted_early(fpl);
4215		return (false);
4216	}
4217	return (true);
4218}
4219
4220static int
4221cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4222{
4223	struct nameidata *ndp;
4224	int error;
4225	bool fsearch;
4226
4227	ndp = fpl->ndp;
4228	error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4229	if (__predict_false(error != 0)) {
4230		return (cache_fpl_aborted(fpl));
4231	}
4232	fpl->fsearch = fsearch;
4233	return (0);
4234}
4235
4236static int __noinline
4237cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4238    uint32_t hash)
4239{
4240	struct componentname *cnp;
4241	struct vnode *dvp;
4242
4243	cnp = fpl->cnp;
4244	dvp = fpl->dvp;
4245
4246	cache_fpl_smr_exit(fpl);
4247	if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4248		return (cache_fpl_handled_error(fpl, ENOENT));
4249	else
4250		return (cache_fpl_aborted(fpl));
4251}
4252
4253/*
4254 * The target vnode is not supported, prepare for the slow path to take over.
4255 */
4256static int __noinline
4257cache_fplookup_partial_setup(struct cache_fpl *fpl)
4258{
4259	struct nameidata *ndp;
4260	struct componentname *cnp;
4261	enum vgetstate dvs;
4262	struct vnode *dvp;
4263	struct pwd *pwd;
4264	seqc_t dvp_seqc;
4265
4266	ndp = fpl->ndp;
4267	cnp = fpl->cnp;
4268	pwd = *(fpl->pwd);
4269	dvp = fpl->dvp;
4270	dvp_seqc = fpl->dvp_seqc;
4271
4272	if (!pwd_hold_smr(pwd)) {
4273		return (cache_fpl_aborted(fpl));
4274	}
4275
4276	/*
4277	 * Note that seqc is checked before the vnode is locked, so by
4278	 * the time regular lookup gets to it, the vnode may have moved.
4279	 *
4280	 * Ultimately this does not affect correctness, any lookup errors
4281	 * are userspace racing with itself. It is guaranteed that any
4282	 * path which ultimately gets found could also have been found
4283	 * by regular lookup going all the way in absence of concurrent
4284	 * modifications.
4285	 */
4286	dvs = vget_prep_smr(dvp);
4287	cache_fpl_smr_exit(fpl);
4288	if (__predict_false(dvs == VGET_NONE)) {
4289		pwd_drop(pwd);
4290		return (cache_fpl_aborted(fpl));
4291	}
4292
4293	vget_finish_ref(dvp, dvs);
4294	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4295		vrele(dvp);
4296		pwd_drop(pwd);
4297		return (cache_fpl_aborted(fpl));
4298	}
4299
4300	cache_fpl_restore_partial(fpl);
4301#ifdef INVARIANTS
4302	if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4303		panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4304		    cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4305	}
4306#endif
4307
4308	ndp->ni_startdir = dvp;
4309	cnp->cn_flags |= MAKEENTRY;
4310	if (cache_fpl_islastcn(ndp))
4311		cnp->cn_flags |= ISLASTCN;
4312	if (cache_fpl_isdotdot(cnp))
4313		cnp->cn_flags |= ISDOTDOT;
4314
4315	/*
4316	 * Skip potential extra slashes parsing did not take care of.
4317	 * cache_fplookup_skip_slashes explains the mechanism.
4318	 */
4319	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4320		do {
4321			cnp->cn_nameptr++;
4322			cache_fpl_pathlen_dec(fpl);
4323		} while (*(cnp->cn_nameptr) == '/');
4324	}
4325
4326	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4327#ifdef INVARIANTS
4328	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4329		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4330		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4331		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4332	}
4333#endif
4334	return (0);
4335}
4336
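/*
 * Secure the target vnode: lock it if LOCKLEAF was requested (shared or
 * exclusive depending on LOCKSHARED), otherwise only reference it, then
 * verify its sequence counter did not change in the meantime.
 */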
4337static int
4338cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4339{
4340	struct componentname *cnp;
4341	struct vnode *tvp;
4342	seqc_t tvp_seqc;
4343	int error, lkflags;
4344
4345	cnp = fpl->cnp;
4346	tvp = fpl->tvp;
4347	tvp_seqc = fpl->tvp_seqc;
4348
4349	if ((cnp->cn_flags & LOCKLEAF) != 0) {
4350		lkflags = LK_SHARED;
4351		if ((cnp->cn_flags & LOCKSHARED) == 0)
4352			lkflags = LK_EXCLUSIVE;
4353		error = vget_finish(tvp, lkflags, tvs);
4354		if (__predict_false(error != 0)) {
4355			return (cache_fpl_aborted(fpl));
4356		}
4357	} else {
4358		vget_finish_ref(tvp, tvs);
4359	}
4360
4361	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4362		if ((cnp->cn_flags & LOCKLEAF) != 0)
4363			vput(tvp);
4364		else
4365			vrele(tvp);
4366		return (cache_fpl_aborted(fpl));
4367	}
4368
4369	return (cache_fpl_handled(fpl));
4370}
4371
4372/*
4373 * The caller may want to modify the state of the namecache.
4374 */
4375static int __noinline
4376cache_fplookup_final_modifying(struct cache_fpl *fpl)
4377{
4378	struct nameidata *ndp;
4379	struct componentname *cnp;
4380	enum vgetstate dvs;
4381	struct vnode *dvp, *tvp;
4382	struct mount *mp;
4383	seqc_t dvp_seqc;
4384	int error;
4385	bool docache;
4386
4387	ndp = fpl->ndp;
4388	cnp = fpl->cnp;
4389	dvp = fpl->dvp;
4390	dvp_seqc = fpl->dvp_seqc;
4391
4392	MPASS(*(cnp->cn_nameptr) != '/');
4393	MPASS(cache_fpl_islastcn(ndp));
4394	if ((cnp->cn_flags & LOCKPARENT) == 0)
4395		MPASS((cnp->cn_flags & WANTPARENT) != 0);
4396	MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4397	MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4398	    cnp->cn_nameiop == RENAME);
4399	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4400	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4401
4402	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4403	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4404		docache = false;
4405
4406	/*
4407	 * Regular lookup nullifies the slash, which we don't do here.
4408	 * Don't take chances with filesystem routines seeing it for
4409	 * the last entry.
4410	 */
4411	if (cache_fpl_istrailingslash(fpl)) {
4412		return (cache_fpl_partial(fpl));
4413	}
4414
4415	mp = atomic_load_ptr(&dvp->v_mount);
4416	if (__predict_false(mp == NULL)) {
4417		return (cache_fpl_aborted(fpl));
4418	}
4419
4420	if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4421		cache_fpl_smr_exit(fpl);
4422		/*
4423		 * The original code does not check for CREATE, which
4424		 * might be a bug. For now let the old lookup decide.
4425		 */
4426		if (cnp->cn_nameiop == CREATE) {
4427			return (cache_fpl_aborted(fpl));
4428		}
4429		return (cache_fpl_handled_error(fpl, EROFS));
4430	}
4431
4432	if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4433		cache_fpl_smr_exit(fpl);
4434		return (cache_fpl_handled_error(fpl, EEXIST));
4435	}
4436
4437	/*
4438	 * Secure access to dvp; check cache_fplookup_partial_setup for
4439	 * reasoning.
4440	 *
4441	 * XXX At least UFS requires its lookup routine to be called for
4442	 * the last path component, which leads to some level of complication
4443	 * and inefficiency:
4444	 * - the target routine always locks the target vnode, but our caller
4445	 *   may not need it locked
4446	 * - some of the VOP machinery asserts that the parent is locked, which
4447	 *   once more may be not required
4448	 *
4449	 * TODO: add a flag for filesystems which don't need this.
4450	 */
4451	dvs = vget_prep_smr(dvp);
4452	cache_fpl_smr_exit(fpl);
4453	if (__predict_false(dvs == VGET_NONE)) {
4454		return (cache_fpl_aborted(fpl));
4455	}
4456
4457	vget_finish_ref(dvp, dvs);
4458	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4459		vrele(dvp);
4460		return (cache_fpl_aborted(fpl));
4461	}
4462
4463	error = vn_lock(dvp, LK_EXCLUSIVE);
4464	if (__predict_false(error != 0)) {
4465		vrele(dvp);
4466		return (cache_fpl_aborted(fpl));
4467	}
4468
4469	tvp = NULL;
4470	cnp->cn_flags |= ISLASTCN;
4471	if (docache)
4472		cnp->cn_flags |= MAKEENTRY;
4473	if (cache_fpl_isdotdot(cnp))
4474		cnp->cn_flags |= ISDOTDOT;
4475	cnp->cn_lkflags = LK_EXCLUSIVE;
4476	error = VOP_LOOKUP(dvp, &tvp, cnp);
4477	switch (error) {
4478	case EJUSTRETURN:
4479	case 0:
4480		break;
4481	case ENOTDIR:
4482	case ENOENT:
4483		vput(dvp);
4484		return (cache_fpl_handled_error(fpl, error));
4485	default:
4486		vput(dvp);
4487		return (cache_fpl_aborted(fpl));
4488	}
4489
4490	fpl->tvp = tvp;
4491	fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
4492
4493	if (tvp == NULL) {
4494		if ((cnp->cn_flags & SAVESTART) != 0) {
4495			ndp->ni_startdir = dvp;
4496			vrefact(ndp->ni_startdir);
4497			cnp->cn_flags |= SAVENAME;
4498			fpl->savename = true;
4499		}
4500		MPASS(error == EJUSTRETURN);
4501		if ((cnp->cn_flags & LOCKPARENT) == 0) {
4502			VOP_UNLOCK(dvp);
4503		}
4504		return (cache_fpl_handled(fpl));
4505	}
4506
4507	/*
4508	 * There are very hairy corner cases concerning various flag combinations
4509	 * and locking state. In particular here we only hold one lock instead of
4510	 * two.
4511	 *
4512	 * Skip the complexity as it is of no significance for normal workloads.
4513	 */
4514	if (__predict_false(tvp == dvp)) {
4515		vput(dvp);
4516		vrele(tvp);
4517		return (cache_fpl_aborted(fpl));
4518	}
4519
4520	/*
4521	 * If they want the symlink itself we are fine, but if they want to
4522	 * follow it regular lookup has to be engaged.
4523	 */
4524	if (tvp->v_type == VLNK) {
4525		if ((cnp->cn_flags & FOLLOW) != 0) {
4526			vput(dvp);
4527			vput(tvp);
4528			return (cache_fpl_aborted(fpl));
4529		}
4530	}
4531
4532	/*
4533	 * Since we expect this to be the terminal vnode it should almost never
4534	 * be a mount point.
4535	 */
4536	if (__predict_false(cache_fplookup_is_mp(fpl))) {
4537		vput(dvp);
4538		vput(tvp);
4539		return (cache_fpl_aborted(fpl));
4540	}
4541
4542	if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4543		vput(dvp);
4544		vput(tvp);
4545		return (cache_fpl_handled_error(fpl, EEXIST));
4546	}
4547
4548	if ((cnp->cn_flags & LOCKLEAF) == 0) {
4549		VOP_UNLOCK(tvp);
4550	}
4551
4552	if ((cnp->cn_flags & LOCKPARENT) == 0) {
4553		VOP_UNLOCK(dvp);
4554	}
4555
4556	if ((cnp->cn_flags & SAVESTART) != 0) {
4557		ndp->ni_startdir = dvp;
4558		vrefact(ndp->ni_startdir);
4559		cnp->cn_flags |= SAVENAME;
4560		fpl->savename = true;
4561	}
4562
4563	return (cache_fpl_handled(fpl));
4564}
4565
4566static int __noinline
4567cache_fplookup_modifying(struct cache_fpl *fpl)
4568{
4569	struct nameidata *ndp;
4570
4571	ndp = fpl->ndp;
4572
4573	if (!cache_fpl_islastcn(ndp)) {
4574		return (cache_fpl_partial(fpl));
4575	}
4576	return (cache_fplookup_final_modifying(fpl));
4577}
4578
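/*
 * Finish the lookup while also securing the parent vnode, as requested with
 * LOCKPARENT and/or WANTPARENT.
 */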
4579static int __noinline
4580cache_fplookup_final_withparent(struct cache_fpl *fpl)
4581{
4582	struct componentname *cnp;
4583	enum vgetstate dvs, tvs;
4584	struct vnode *dvp, *tvp;
4585	seqc_t dvp_seqc;
4586	int error;
4587
4588	cnp = fpl->cnp;
4589	dvp = fpl->dvp;
4590	dvp_seqc = fpl->dvp_seqc;
4591	tvp = fpl->tvp;
4592
4593	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4594
4595	/*
4596	 * For simplicity this is less efficient than it could be.
4597	 */
4598	dvs = vget_prep_smr(dvp);
4599	if (__predict_false(dvs == VGET_NONE)) {
4600		return (cache_fpl_aborted(fpl));
4601	}
4602	tvs = vget_prep_smr(tvp);
4603	if (__predict_false(tvs == VGET_NONE)) {
4604		cache_fpl_smr_exit(fpl);
4605		vget_abort(dvp, dvs);
4606		return (cache_fpl_aborted(fpl));
4607	}
4608
4609	cache_fpl_smr_exit(fpl);
4610
4611	if ((cnp->cn_flags & LOCKPARENT) != 0) {
4612		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4613		if (__predict_false(error != 0)) {
4614			vget_abort(tvp, tvs);
4615			return (cache_fpl_aborted(fpl));
4616		}
4617	} else {
4618		vget_finish_ref(dvp, dvs);
4619	}
4620
4621	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4622		vget_abort(tvp, tvs);
4623		if ((cnp->cn_flags & LOCKPARENT) != 0)
4624			vput(dvp);
4625		else
4626			vrele(dvp);
4627		return (cache_fpl_aborted(fpl));
4628	}
4629
4630	error = cache_fplookup_final_child(fpl, tvs);
4631	if (__predict_false(error != 0)) {
4632		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4633		    fpl->status == CACHE_FPL_STATUS_DESTROYED);
4634		if ((cnp->cn_flags & LOCKPARENT) != 0)
4635			vput(dvp);
4636		else
4637			vrele(dvp);
4638		return (error);
4639	}
4640
4641	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4642	return (0);
4643}
4644
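/*
 * Finish the lookup after the last path component was found. Modifying
 * operations and lookups which want the parent are handed off to dedicated
 * routines; the common case only secures the child and revalidates the
 * parent's sequence counter.
 */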
4645static int
4646cache_fplookup_final(struct cache_fpl *fpl)
4647{
4648	struct componentname *cnp;
4649	enum vgetstate tvs;
4650	struct vnode *dvp, *tvp;
4651	seqc_t dvp_seqc;
4652
4653	cnp = fpl->cnp;
4654	dvp = fpl->dvp;
4655	dvp_seqc = fpl->dvp_seqc;
4656	tvp = fpl->tvp;
4657
4658	MPASS(*(cnp->cn_nameptr) != '/');
4659
4660	if (cnp->cn_nameiop != LOOKUP) {
4661		return (cache_fplookup_final_modifying(fpl));
4662	}
4663
4664	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4665		return (cache_fplookup_final_withparent(fpl));
4666
4667	tvs = vget_prep_smr(tvp);
4668	if (__predict_false(tvs == VGET_NONE)) {
4669		return (cache_fpl_partial(fpl));
4670	}
4671
4672	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4673		cache_fpl_smr_exit(fpl);
4674		vget_abort(tvp, tvs);
4675		return (cache_fpl_aborted(fpl));
4676	}
4677
4678	cache_fpl_smr_exit(fpl);
4679	return (cache_fplookup_final_child(fpl, tvs));
4680}
4681
4682/*
4683 * Comment from locked lookup:
4684 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4685 * directory, e.g. like "/." or ".".
4686 */
4687static int __noinline
4688cache_fplookup_degenerate(struct cache_fpl *fpl)
4689{
4690	struct componentname *cnp;
4691	struct vnode *dvp;
4692	enum vgetstate dvs;
4693	int error, lkflags;
4694#ifdef INVARIANTS
4695	char *cp;
4696#endif
4697
4698	fpl->tvp = fpl->dvp;
4699	fpl->tvp_seqc = fpl->dvp_seqc;
4700
4701	cnp = fpl->cnp;
4702	dvp = fpl->dvp;
4703
4704#ifdef INVARIANTS
4705	for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4706		KASSERT(*cp == '/',
4707		    ("%s: encountered non-slash; string [%s]\n", __func__,
4708		    cnp->cn_pnbuf));
4709	}
4710#endif
4711
4712	if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4713		cache_fpl_smr_exit(fpl);
4714		return (cache_fpl_handled_error(fpl, EISDIR));
4715	}
4716
4717	MPASS((cnp->cn_flags & SAVESTART) == 0);
4718
4719	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4720		return (cache_fplookup_final_withparent(fpl));
4721	}
4722
4723	dvs = vget_prep_smr(dvp);
4724	cache_fpl_smr_exit(fpl);
4725	if (__predict_false(dvs == VGET_NONE)) {
4726		return (cache_fpl_aborted(fpl));
4727	}
4728
4729	if ((cnp->cn_flags & LOCKLEAF) != 0) {
4730		lkflags = LK_SHARED;
4731		if ((cnp->cn_flags & LOCKSHARED) == 0)
4732			lkflags = LK_EXCLUSIVE;
4733		error = vget_finish(dvp, lkflags, dvs);
4734		if (__predict_false(error != 0)) {
4735			return (cache_fpl_aborted(fpl));
4736		}
4737	} else {
4738		vget_finish_ref(dvp, dvs);
4739	}
4740	return (cache_fpl_handled(fpl));
4741}
4742
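/*
 * There is no namecache entry for the current path component. Handle the
 * special cases (spurious slashes, degenerate names, trailing slashes and
 * modifying operations) and, for the last component of a plain lookup, fill
 * it in by calling VOP_LOOKUP on the locked parent.
 */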
4743static int __noinline
4744cache_fplookup_noentry(struct cache_fpl *fpl)
4745{
4746	struct nameidata *ndp;
4747	struct componentname *cnp;
4748	enum vgetstate dvs;
4749	struct vnode *dvp, *tvp;
4750	seqc_t dvp_seqc;
4751	int error;
4752	bool docache;
4753
4754	ndp = fpl->ndp;
4755	cnp = fpl->cnp;
4756	dvp = fpl->dvp;
4757	dvp_seqc = fpl->dvp_seqc;
4758
4759	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4760	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4761	MPASS(!cache_fpl_isdotdot(cnp));
4762
4763	/*
4764	 * Hack: delayed name len checking.
4765	 */
4766	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4767		cache_fpl_smr_exit(fpl);
4768		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
4769	}
4770
4771	if (cnp->cn_nameptr[0] == '/') {
4772		return (cache_fplookup_skip_slashes(fpl));
4773	}
4774
4775	if (cnp->cn_nameptr[0] == '\0') {
4776		if (fpl->tvp == NULL) {
4777			return (cache_fplookup_degenerate(fpl));
4778		}
4779		return (cache_fplookup_trailingslash(fpl));
4780	}
4781
4782	if (cnp->cn_nameiop != LOOKUP) {
4783		fpl->tvp = NULL;
4784		return (cache_fplookup_modifying(fpl));
4785	}
4786
4787	MPASS((cnp->cn_flags & SAVESTART) == 0);
4788
4789	/*
4790	 * Only try to fill in the component if it is the last one;
4791	 * otherwise not only may there be several to handle, but the
4792	 * walk may be complicated.
4793	 */
4794	if (!cache_fpl_islastcn(ndp)) {
4795		return (cache_fpl_partial(fpl));
4796	}
4797
4798	/*
4799	 * Regular lookup nullifies the slash, which we don't do here.
4800	 * Don't take chances with filesystem routines seeing it for
4801	 * the last entry.
4802	 */
4803	if (cache_fpl_istrailingslash(fpl)) {
4804		return (cache_fpl_partial(fpl));
4805	}
4806
4807	/*
4808	 * Secure access to dvp; check cache_fplookup_partial_setup for
4809	 * reasoning.
4810	 */
4811	dvs = vget_prep_smr(dvp);
4812	cache_fpl_smr_exit(fpl);
4813	if (__predict_false(dvs == VGET_NONE)) {
4814		return (cache_fpl_aborted(fpl));
4815	}
4816
4817	vget_finish_ref(dvp, dvs);
4818	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4819		vrele(dvp);
4820		return (cache_fpl_aborted(fpl));
4821	}
4822
4823	error = vn_lock(dvp, LK_SHARED);
4824	if (__predict_false(error != 0)) {
4825		vrele(dvp);
4826		return (cache_fpl_aborted(fpl));
4827	}
4828
4829	tvp = NULL;
4830	/*
4831	 * TODO: provide variants which don't require locking either vnode.
4832	 */
4833	cnp->cn_flags |= ISLASTCN;
4834	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4835	if (docache)
4836		cnp->cn_flags |= MAKEENTRY;
4837	cnp->cn_lkflags = LK_SHARED;
4838	if ((cnp->cn_flags & LOCKSHARED) == 0) {
4839		cnp->cn_lkflags = LK_EXCLUSIVE;
4840	}
4841	error = VOP_LOOKUP(dvp, &tvp, cnp);
4842	switch (error) {
4843	case EJUSTRETURN:
4844	case 0:
4845		break;
4846	case ENOTDIR:
4847	case ENOENT:
4848		vput(dvp);
4849		return (cache_fpl_handled_error(fpl, error));
4850	default:
4851		vput(dvp);
4852		return (cache_fpl_aborted(fpl));
4853	}
4854
4855	fpl->tvp = tvp;
4856	if (!fpl->savename) {
4857		MPASS((cnp->cn_flags & SAVENAME) == 0);
4858	}
4859
4860	if (tvp == NULL) {
4861		MPASS(error == EJUSTRETURN);
4862		if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4863			vput(dvp);
4864		} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4865			VOP_UNLOCK(dvp);
4866		}
4867		return (cache_fpl_handled(fpl));
4868	}
4869
4870	if (tvp->v_type == VLNK) {
4871		if ((cnp->cn_flags & FOLLOW) != 0) {
4872			vput(dvp);
4873			vput(tvp);
4874			return (cache_fpl_aborted(fpl));
4875		}
4876	}
4877
4878	if (__predict_false(cache_fplookup_is_mp(fpl))) {
4879		vput(dvp);
4880		vput(tvp);
4881		return (cache_fpl_aborted(fpl));
4882	}
4883
4884	if ((cnp->cn_flags & LOCKLEAF) == 0) {
4885		VOP_UNLOCK(tvp);
4886	}
4887
4888	if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4889		vput(dvp);
4890	} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4891		VOP_UNLOCK(dvp);
4892	}
4893	return (cache_fpl_handled(fpl));
4894}
4895
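/*
 * Handle the "." path component: the target is the current directory, modulo
 * a possible mount point on top of it.
 */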
4896static int __noinline
4897cache_fplookup_dot(struct cache_fpl *fpl)
4898{
4899	int error;
4900
4901	MPASS(!seqc_in_modify(fpl->dvp_seqc));
4902	/*
4903	 * Just re-assign the value. seqc will be checked later for the first
4904	 * non-dot path component in line and/or before deciding to return the
4905	 * vnode.
4906	 */
4907	fpl->tvp = fpl->dvp;
4908	fpl->tvp_seqc = fpl->dvp_seqc;
4909
4910	counter_u64_add(dothits, 1);
4911	SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
4912
4913	error = 0;
4914	if (cache_fplookup_is_mp(fpl)) {
4915		error = cache_fplookup_cross_mount(fpl);
4916	}
4917	return (error);
4918}
4919
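/*
 * Handle the ".." path component: stay put when already at a root (the
 * process root directory, the top directory, the global root or a jail
 * root), punt to the slow path when the lookup has to leave the current
 * mount, and otherwise use the cached ".." entry hanging off of v_cache_dd.
 */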
4920static int __noinline
4921cache_fplookup_dotdot(struct cache_fpl *fpl)
4922{
4923	struct nameidata *ndp;
4924	struct componentname *cnp;
4925	struct namecache *ncp;
4926	struct vnode *dvp;
4927	struct prison *pr;
4928	u_char nc_flag;
4929
4930	ndp = fpl->ndp;
4931	cnp = fpl->cnp;
4932	dvp = fpl->dvp;
4933
4934	MPASS(cache_fpl_isdotdot(cnp));
4935
4936	/*
4937	 * XXX this is racy the same way regular lookup is
4938	 */
4939	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4940	    pr = pr->pr_parent)
4941		if (dvp == pr->pr_root)
4942			break;
4943
4944	if (dvp == ndp->ni_rootdir ||
4945	    dvp == ndp->ni_topdir ||
4946	    dvp == rootvnode ||
4947	    pr != NULL) {
4948		fpl->tvp = dvp;
4949		fpl->tvp_seqc = vn_seqc_read_any(dvp);
4950		if (seqc_in_modify(fpl->tvp_seqc)) {
4951			return (cache_fpl_aborted(fpl));
4952		}
4953		return (0);
4954	}
4955
4956	if ((dvp->v_vflag & VV_ROOT) != 0) {
4957		/*
4958		 * TODO
4959		 * The opposite of climb mount is needed here.
4960		 */
4961		return (cache_fpl_partial(fpl));
4962	}
4963
4964	ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
4965	if (ncp == NULL) {
4966		return (cache_fpl_aborted(fpl));
4967	}
4968
4969	nc_flag = atomic_load_char(&ncp->nc_flag);
4970	if ((nc_flag & NCF_ISDOTDOT) != 0) {
4971		if ((nc_flag & NCF_NEGATIVE) != 0)
4972			return (cache_fpl_aborted(fpl));
4973		fpl->tvp = ncp->nc_vp;
4974	} else {
4975		fpl->tvp = ncp->nc_dvp;
4976	}
4977
4978	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4979	if (seqc_in_modify(fpl->tvp_seqc)) {
4980		return (cache_fpl_partial(fpl));
4981	}
4982
4983	/*
4984	 * Acquire fence provided by vn_seqc_read_any above.
4985	 */
4986	if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
4987		return (cache_fpl_aborted(fpl));
4988	}
4989
4990	if (!cache_ncp_canuse(ncp)) {
4991		return (cache_fpl_aborted(fpl));
4992	}
4993
4994	counter_u64_add(dotdothits, 1);
4995	return (0);
4996}
4997
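/*
 * A negative entry was found. Modifying operations are handed off to
 * cache_fplookup_modifying(); plain lookups possibly promote the entry to
 * the hot list and terminate with ENOENT.
 */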
4998static int __noinline
4999cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5000{
5001	u_char nc_flag;
5002	bool neg_promote;
5003
5004	nc_flag = atomic_load_char(&ncp->nc_flag);
5005	MPASS((nc_flag & NCF_NEGATIVE) != 0);
5006	/*
5007	 * If they want to create an entry we need to replace this one.
5008	 */
5009	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5010		fpl->tvp = NULL;
5011		return (cache_fplookup_modifying(fpl));
5012	}
5013	neg_promote = cache_neg_hit_prep(ncp);
5014	if (!cache_fpl_neg_ncp_canuse(ncp)) {
5015		cache_neg_hit_abort(ncp);
5016		return (cache_fpl_partial(fpl));
5017	}
5018	if (neg_promote) {
5019		return (cache_fplookup_negative_promote(fpl, ncp, hash));
5020	}
5021	cache_neg_hit_finish(ncp);
5022	cache_fpl_smr_exit(fpl);
5023	return (cache_fpl_handled_error(fpl, ENOENT));
5024}
5025
5026/*
5027 * Resolve a symlink. Called by filesystem-specific routines.
5028 *
5029 * Code flow is:
5030 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5031 */
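/*
 * A sketch of the expected usage from a filesystem's VOP_FPLOOKUP_SYMLINK
 * routine (the names of the target buffer and its length below are
 * illustrative only):
 *
 *	return (cache_symlink_resolve(fpl, symlink_target, symlink_len));
 */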
5032int
5033cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5034{
5035	struct nameidata *ndp;
5036	struct componentname *cnp;
5037	size_t adjust;
5038
5039	ndp = fpl->ndp;
5040	cnp = fpl->cnp;
5041
5042	if (__predict_false(len == 0)) {
5043		return (ENOENT);
5044	}
5045
5046	if (__predict_false(len > MAXPATHLEN - 2)) {
5047		if (cache_fpl_istrailingslash(fpl)) {
5048			return (EAGAIN);
5049		}
5050	}
5051
5052	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5053#ifdef INVARIANTS
5054	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5055		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5056		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5057		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5058	}
5059#endif
5060
5061	if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5062		return (ENAMETOOLONG);
5063	}
5064
5065	if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5066		return (ELOOP);
5067	}
5068
5069	adjust = len;
5070	if (ndp->ni_pathlen > 1) {
5071		bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5072	} else {
5073		if (cache_fpl_istrailingslash(fpl)) {
5074			adjust = len + 1;
5075			cnp->cn_pnbuf[len] = '/';
5076			cnp->cn_pnbuf[len + 1] = '\0';
5077		} else {
5078			cnp->cn_pnbuf[len] = '\0';
5079		}
5080	}
5081	bcopy(string, cnp->cn_pnbuf, len);
5082
5083	ndp->ni_pathlen += adjust;
5084	cache_fpl_pathlen_add(fpl, adjust);
5085	cnp->cn_nameptr = cnp->cn_pnbuf;
5086	fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5087	fpl->tvp = NULL;
5088	return (0);
5089}
5090
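/*
 * The target vnode is a symlink. Unless this is the last component and
 * FOLLOW was not requested, ask the filesystem to resolve it (see
 * cache_symlink_resolve) and restart from the root directory if the target
 * is absolute.
 */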
5091static int __noinline
5092cache_fplookup_symlink(struct cache_fpl *fpl)
5093{
5094	struct mount *mp;
5095	struct nameidata *ndp;
5096	struct componentname *cnp;
5097	struct vnode *dvp, *tvp;
5098	int error;
5099
5100	ndp = fpl->ndp;
5101	cnp = fpl->cnp;
5102	dvp = fpl->dvp;
5103	tvp = fpl->tvp;
5104
5105	if (cache_fpl_islastcn(ndp)) {
5106		if ((cnp->cn_flags & FOLLOW) == 0) {
5107			return (cache_fplookup_final(fpl));
5108		}
5109	}
5110
5111	mp = atomic_load_ptr(&dvp->v_mount);
5112	if (__predict_false(mp == NULL)) {
5113		return (cache_fpl_aborted(fpl));
5114	}
5115
5116	/*
5117	 * Note this check races against setting the flag just like regular
5118	 * lookup.
5119	 */
5120	if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5121		cache_fpl_smr_exit(fpl);
5122		return (cache_fpl_handled_error(fpl, EACCES));
5123	}
5124
5125	error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5126	if (__predict_false(error != 0)) {
5127		switch (error) {
5128		case EAGAIN:
5129			return (cache_fpl_partial(fpl));
5130		case ENOENT:
5131		case ENAMETOOLONG:
5132		case ELOOP:
5133			cache_fpl_smr_exit(fpl);
5134			return (cache_fpl_handled_error(fpl, error));
5135		default:
5136			return (cache_fpl_aborted(fpl));
5137		}
5138	}
5139
5140	if (*(cnp->cn_nameptr) == '/') {
5141		fpl->dvp = cache_fpl_handle_root(fpl);
5142		fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5143		if (seqc_in_modify(fpl->dvp_seqc)) {
5144			return (cache_fpl_aborted(fpl));
5145		}
5146		/*
5147		 * The main loop assumes that ->dvp points to a vnode belonging
5148		 * to a filesystem which can do lockless lookup, but the absolute
5149		 * symlink can be wandering off to one which does not.
5150		 */
5151		mp = atomic_load_ptr(&fpl->dvp->v_mount);
5152		if (__predict_false(mp == NULL)) {
5153			return (cache_fpl_aborted(fpl));
5154		}
5155		if (!cache_fplookup_mp_supported(mp)) {
5156			cache_fpl_checkpoint(fpl);
5157			return (cache_fpl_partial(fpl));
5158		}
5159	}
5160	return (0);
5161}
5162
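/*
 * Find the vnode for the current path component: handle "." and ".."
 * specially, otherwise search the hash chain. Misses and negative entries
 * are dispatched to dedicated routines; mount points are crossed as needed.
 */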
5163static int
5164cache_fplookup_next(struct cache_fpl *fpl)
5165{
5166	struct componentname *cnp;
5167	struct namecache *ncp;
5168	struct vnode *dvp, *tvp;
5169	u_char nc_flag;
5170	uint32_t hash;
5171	int error;
5172
5173	cnp = fpl->cnp;
5174	dvp = fpl->dvp;
5175	hash = fpl->hash;
5176
5177	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5178		if (cnp->cn_namelen == 1) {
5179			return (cache_fplookup_dot(fpl));
5180		}
5181		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5182			return (cache_fplookup_dotdot(fpl));
5183		}
5184	}
5185
5186	MPASS(!cache_fpl_isdotdot(cnp));
5187
5188	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5189		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5190		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5191			break;
5192	}
5193
5194	if (__predict_false(ncp == NULL)) {
5195		return (cache_fplookup_noentry(fpl));
5196	}
5197
5198	tvp = atomic_load_ptr(&ncp->nc_vp);
5199	nc_flag = atomic_load_char(&ncp->nc_flag);
5200	if ((nc_flag & NCF_NEGATIVE) != 0) {
5201		return (cache_fplookup_neg(fpl, ncp, hash));
5202	}
5203
5204	if (!cache_ncp_canuse(ncp)) {
5205		return (cache_fpl_partial(fpl));
5206	}
5207
5208	fpl->tvp = tvp;
5209	fpl->tvp_seqc = vn_seqc_read_any(tvp);
5210	if (seqc_in_modify(fpl->tvp_seqc)) {
5211		return (cache_fpl_partial(fpl));
5212	}
5213
5214	counter_u64_add(numposhits, 1);
5215	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5216
5217	error = 0;
5218	if (cache_fplookup_is_mp(fpl)) {
5219		error = cache_fplookup_cross_mount(fpl);
5220	}
5221	return (error);
5222}
5223
5224static bool
5225cache_fplookup_mp_supported(struct mount *mp)
5226{
5227
5228	MPASS(mp != NULL);
5229	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5230		return (false);
5231	return (true);
5232}
5233
5234/*
5235 * Walk up the mount stack (if any).
5236 *
5237 * Correctness is provided in the following ways:
5238 * - all vnodes are protected from freeing with SMR
5239 * - struct mount objects are type stable making them always safe to access
5240 * - stability of the particular mount is provided by busying it
5241 * - relationship between the vnode which is mounted on and the mount is
5242 *   verified with the vnode sequence counter after busying
5243 * - association between root vnode of the mount and the mount is protected
5244 *   by busy
5245 *
5246 * From that point on we can read the sequence counter of the root vnode
5247 * and get the next mount on the stack (if any) using the same protection.
5248 *
5249 * By the end of a successful walk we are guaranteed the reached state was
5250 * indeed present at least at some point, which matches the regular lookup.
5251 */
5252static int __noinline
5253cache_fplookup_climb_mount(struct cache_fpl *fpl)
5254{
5255	struct mount *mp, *prev_mp;
5256	struct mount_pcpu *mpcpu, *prev_mpcpu;
5257	struct vnode *vp;
5258	seqc_t vp_seqc;
5259
5260	vp = fpl->tvp;
5261	vp_seqc = fpl->tvp_seqc;
5262
5263	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5264	mp = atomic_load_ptr(&vp->v_mountedhere);
5265	if (__predict_false(mp == NULL)) {
5266		return (0);
5267	}
5268
5269	prev_mp = NULL;
5270	for (;;) {
5271		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5272			if (prev_mp != NULL)
5273				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5274			return (cache_fpl_partial(fpl));
5275		}
5276		if (prev_mp != NULL)
5277			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5278		if (!vn_seqc_consistent(vp, vp_seqc)) {
5279			vfs_op_thread_exit_crit(mp, mpcpu);
5280			return (cache_fpl_partial(fpl));
5281		}
5282		if (!cache_fplookup_mp_supported(mp)) {
5283			vfs_op_thread_exit_crit(mp, mpcpu);
5284			return (cache_fpl_partial(fpl));
5285		}
5286		vp = atomic_load_ptr(&mp->mnt_rootvnode);
5287		if (vp == NULL) {
5288			vfs_op_thread_exit_crit(mp, mpcpu);
5289			return (cache_fpl_partial(fpl));
5290		}
5291		vp_seqc = vn_seqc_read_any(vp);
5292		if (seqc_in_modify(vp_seqc)) {
5293			vfs_op_thread_exit_crit(mp, mpcpu);
5294			return (cache_fpl_partial(fpl));
5295		}
5296		prev_mp = mp;
5297		prev_mpcpu = mpcpu;
5298		mp = atomic_load_ptr(&vp->v_mountedhere);
5299		if (mp == NULL)
5300			break;
5301	}
5302
5303	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5304	fpl->tvp = vp;
5305	fpl->tvp_seqc = vp_seqc;
5306	return (0);
5307}
5308
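/*
 * Cross a single mount point, which is the common case. Should more mounts
 * be stacked on top of each other, fall back to cache_fplookup_climb_mount.
 */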
5309static int __noinline
5310cache_fplookup_cross_mount(struct cache_fpl *fpl)
5311{
5312	struct mount *mp;
5313	struct mount_pcpu *mpcpu;
5314	struct vnode *vp;
5315	seqc_t vp_seqc;
5316
5317	vp = fpl->tvp;
5318	vp_seqc = fpl->tvp_seqc;
5319
5320	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5321	mp = atomic_load_ptr(&vp->v_mountedhere);
5322	if (__predict_false(mp == NULL)) {
5323		return (0);
5324	}
5325
5326	if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5327		return (cache_fpl_partial(fpl));
5328	}
5329	if (!vn_seqc_consistent(vp, vp_seqc)) {
5330		vfs_op_thread_exit_crit(mp, mpcpu);
5331		return (cache_fpl_partial(fpl));
5332	}
5333	if (!cache_fplookup_mp_supported(mp)) {
5334		vfs_op_thread_exit_crit(mp, mpcpu);
5335		return (cache_fpl_partial(fpl));
5336	}
5337	vp = atomic_load_ptr(&mp->mnt_rootvnode);
5338	if (__predict_false(vp == NULL)) {
5339		vfs_op_thread_exit_crit(mp, mpcpu);
5340		return (cache_fpl_partial(fpl));
5341	}
5342	vp_seqc = vn_seqc_read_any(vp);
5343	vfs_op_thread_exit_crit(mp, mpcpu);
5344	if (seqc_in_modify(vp_seqc)) {
5345		return (cache_fpl_partial(fpl));
5346	}
5347	mp = atomic_load_ptr(&vp->v_mountedhere);
5348	if (__predict_false(mp != NULL)) {
5349		/*
5350		 * There are possibly more mount points on top.
5351		 * Normally this does not happen, so for simplicity just start
5352		 * over.
5353		 */
5354		return (cache_fplookup_climb_mount(fpl));
5355	}
5356
5357	fpl->tvp = vp;
5358	fpl->tvp_seqc = vp_seqc;
5359	return (0);
5360}
5361
5362/*
5363 * Check if a vnode is mounted on.
5364 */
5365static bool
5366cache_fplookup_is_mp(struct cache_fpl *fpl)
5367{
5368	struct vnode *vp;
5369
5370	vp = fpl->tvp;
5371	return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5372}
5373
5374/*
5375 * Parse the path.
5376 *
5377 * The code was originally copy-pasted from regular lookup and despite
5378 * clean-ups leaves performance on the table. Any modifications here
5379 * must take into account that in case of fallback the resulting
5380 * nameidata state has to be compatible with the original.
5381 */
5382
5383/*
5384 * Debug ni_pathlen tracking.
5385 */
5386#ifdef INVARIANTS
5387static void
5388cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5389{
5390
5391	fpl->debug.ni_pathlen += n;
5392	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5393	    ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5394}
5395
5396static void
5397cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5398{
5399
5400	fpl->debug.ni_pathlen -= n;
5401	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5402	    ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5403}
5404
5405static void
5406cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5407{
5408
5409	cache_fpl_pathlen_add(fpl, 1);
5410}
5411
5412static void
5413cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5414{
5415
5416	cache_fpl_pathlen_sub(fpl, 1);
5417}
5418#else
5419static void
5420cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5421{
5422}
5423
5424static void
5425cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5426{
5427}
5428
5429static void
5430cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5431{
5432}
5433
5434static void
5435cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5436{
5437}
5438#endif
5439
5440static void
5441cache_fplookup_parse(struct cache_fpl *fpl)
5442{
5443	struct nameidata *ndp;
5444	struct componentname *cnp;
5445	struct vnode *dvp;
5446	char *cp;
5447	uint32_t hash;
5448
5449	ndp = fpl->ndp;
5450	cnp = fpl->cnp;
5451	dvp = fpl->dvp;
5452
5453	/*
5454	 * Find the end of this path component, it is either / or nul.
5455	 *
5456	 * Store / as a temporary sentinel so that we only have one character
5457	 * to test for. Pathnames tend to be short, so this should not
5458	 * result in cache misses.
5459	 *
5460	 * TODO: fix this to be word-sized.
5461	 */
5462	KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5463	    ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5464	    __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5465	    fpl->nulchar, cnp->cn_pnbuf));
5466	KASSERT(*fpl->nulchar == '\0',
5467	    ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5468	    cnp->cn_pnbuf));
5469	hash = cache_get_hash_iter_start(dvp);
5470	*fpl->nulchar = '/';
5471	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5472		KASSERT(*cp != '\0',
5473		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
5474		    cnp->cn_nameptr));
5475		hash = cache_get_hash_iter(*cp, hash);
5476		continue;
5477	}
5478	*fpl->nulchar = '\0';
5479	fpl->hash = cache_get_hash_iter_finish(hash);
5480
5481	cnp->cn_namelen = cp - cnp->cn_nameptr;
5482	cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5483
5484#ifdef INVARIANTS
5485	/*
5486	 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5487	 * we are going to fail this lookup with ENAMETOOLONG (see below).
5488	 */
5489	if (cnp->cn_namelen <= NAME_MAX) {
5490		if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5491			panic("%s: mismatched hash for [%s] len %ld", __func__,
5492			    cnp->cn_nameptr, cnp->cn_namelen);
5493		}
5494	}
5495#endif
5496
5497	/*
5498	 * Hack: we have to check if the found path component's length exceeds
5499	 * NAME_MAX. However, the condition is very rarely true and the check can
5500	 * be elided in the common case -- if an entry was found in the cache,
5501	 * then it could not have been too long to begin with.
5502	 */
5503	ndp->ni_next = cp;
5504}
5505
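/*
 * Advance to the next path component by skipping over the slash which
 * terminated the previous one.
 */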
5506static void
5507cache_fplookup_parse_advance(struct cache_fpl *fpl)
5508{
5509	struct nameidata *ndp;
5510	struct componentname *cnp;
5511
5512	ndp = fpl->ndp;
5513	cnp = fpl->cnp;
5514
5515	cnp->cn_nameptr = ndp->ni_next;
5516	KASSERT(*(cnp->cn_nameptr) == '/',
5517	    ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5518	    cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5519	cnp->cn_nameptr++;
5520	cache_fpl_pathlen_dec(fpl);
5521}
5522
5523/*
5524 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5525 *
5526 * Lockless lookup tries to elide checking for spurious slashes and, should they
5527 * be present, is guaranteed to fail to find an entry. In this case the caller
5528 * must check whether the name starts with a slash and call this routine, which
5529 * fast-forwards across the spurious slashes and sets the state up for a
5530 * retry.
5531 */
5532static int __noinline
5533cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5534{
5535	struct nameidata *ndp;
5536	struct componentname *cnp;
5537
5538	ndp = fpl->ndp;
5539	cnp = fpl->cnp;
5540
5541	MPASS(*(cnp->cn_nameptr) == '/');
5542	do {
5543		cnp->cn_nameptr++;
5544		cache_fpl_pathlen_dec(fpl);
5545	} while (*(cnp->cn_nameptr) == '/');
5546
5547	/*
5548	 * Go back to one slash so that cache_fplookup_parse_advance has
5549	 * something to skip.
5550	 */
5551	cnp->cn_nameptr--;
5552	cache_fpl_pathlen_inc(fpl);
5553
5554	/*
5555	 * cache_fplookup_parse_advance starts from ndp->ni_next
5556	 */
5557	ndp->ni_next = cnp->cn_nameptr;
5558
5559	/*
5560	 * See cache_fplookup_dot.
5561	 */
5562	fpl->tvp = fpl->dvp;
5563	fpl->tvp_seqc = fpl->dvp_seqc;
5564
5565	return (0);
5566}
5567
5568/*
5569 * Handle trailing slashes (e.g., "foo/").
5570 *
5571 * If a trailing slash is found the terminal vnode must be a directory.
5572 * Regular lookup shortens the path by nullifying the first trailing slash and
5573 * sets the TRAILINGSLASH flag to denote this took place. There are several
5574 * checks on it performed later.
5575 *
5576 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5577 * manner relying on an invariant that a non-directory vnode will get a miss.
5578 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5579 *
5580 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5581 * and denotes this is the last path component, which avoids looping back.
5582 *
5583 * Only plain lookups are supported for now to limit the corner cases to handle.
5584 */
5585static int __noinline
5586cache_fplookup_trailingslash(struct cache_fpl *fpl)
5587{
5588#ifdef INVARIANTS
5589	size_t ni_pathlen;
5590#endif
5591	struct nameidata *ndp;
5592	struct componentname *cnp;
5593	struct namecache *ncp;
5594	struct vnode *tvp;
5595	char *cn_nameptr_orig, *cn_nameptr_slash;
5596	seqc_t tvp_seqc;
5597	u_char nc_flag;
5598
5599	ndp = fpl->ndp;
5600	cnp = fpl->cnp;
5601	tvp = fpl->tvp;
5602	tvp_seqc = fpl->tvp_seqc;
5603
5604	MPASS(fpl->dvp == fpl->tvp);
5605	KASSERT(cache_fpl_istrailingslash(fpl),
5606	    ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5607	    cnp->cn_pnbuf));
5608	KASSERT(cnp->cn_nameptr[0] == '\0',
5609	    ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5610	    cnp->cn_pnbuf));
5611	KASSERT(cnp->cn_namelen == 0,
5612	    ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5613	    cnp->cn_pnbuf));
5614	MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5615
5616	if (cnp->cn_nameiop != LOOKUP) {
5617		return (cache_fpl_aborted(fpl));
5618	}
5619
5620	if (__predict_false(tvp->v_type != VDIR)) {
5621		if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5622			return (cache_fpl_aborted(fpl));
5623		}
5624		cache_fpl_smr_exit(fpl);
5625		return (cache_fpl_handled_error(fpl, ENOTDIR));
5626	}
5627
5628	/*
5629	 * Denote the last component.
5630	 */
5631	ndp->ni_next = &cnp->cn_nameptr[0];
5632	MPASS(cache_fpl_islastcn(ndp));
5633
5634	/*
5635	 * Unwind trailing slashes.
5636	 */
5637	cn_nameptr_orig = cnp->cn_nameptr;
5638	while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5639		cnp->cn_nameptr--;
5640		if (cnp->cn_nameptr[0] != '/') {
5641			break;
5642		}
5643	}
5644
5645	/*
5646	 * Unwind to the beginning of the path component.
5647	 *
5648	 * Note the path may or may not have started with a slash.
5649	 */
5650	cn_nameptr_slash = cnp->cn_nameptr;
5651	while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5652		cnp->cn_nameptr--;
5653		if (cnp->cn_nameptr[0] == '/') {
5654			break;
5655		}
5656	}
5657	if (cnp->cn_nameptr[0] == '/') {
5658		cnp->cn_nameptr++;
5659	}
5660
5661	cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5662	cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5663	cache_fpl_checkpoint(fpl);
5664
5665#ifdef INVARIANTS
5666	ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5667	if (ni_pathlen != fpl->debug.ni_pathlen) {
5668		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5669		    __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5670		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5671	}
5672#endif
5673
5674	/*
5675	 * If this was a "./" lookup the parent directory is already correct.
5676	 */
5677	if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5678		return (0);
5679	}
5680
5681	/*
5682	 * Otherwise we need to look it up.
5683	 */
5684	tvp = fpl->tvp;
5685	ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
5686	if (__predict_false(ncp == NULL)) {
5687		return (cache_fpl_aborted(fpl));
5688	}
5689	nc_flag = atomic_load_char(&ncp->nc_flag);
5690	if ((nc_flag & NCF_ISDOTDOT) != 0) {
5691		return (cache_fpl_aborted(fpl));
5692	}
5693	fpl->dvp = ncp->nc_dvp;
5694	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5695	if (seqc_in_modify(fpl->dvp_seqc)) {
5696		return (cache_fpl_aborted(fpl));
5697	}
5698	return (0);
5699}
5700
5701/*
5702 * See the API contract for VOP_FPLOOKUP_VEXEC.
5703 */
5704static int __noinline
5705cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5706{
5707	struct componentname *cnp;
5708	struct vnode *dvp;
5709	seqc_t dvp_seqc;
5710
5711	cnp = fpl->cnp;
5712	dvp = fpl->dvp;
5713	dvp_seqc = fpl->dvp_seqc;
5714
5715	/*
5716	 * TODO: Due to ignoring trailing slashes lookup will perform a
5717	 * permission check on the last dir when it should not be doing it.  It
5718	 * may fail, but said failure should be ignored. It is possible to fix
5719	 * it up fully without resorting to regular lookup, but for now just
5720	 * abort.
5721	 */
5722	if (cache_fpl_istrailingslash(fpl)) {
5723		return (cache_fpl_aborted(fpl));
5724	}
5725
5726	/*
5727	 * Hack: delayed degenerate path checking.
5728	 */
5729	if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
5730		return (cache_fplookup_degenerate(fpl));
5731	}
5732
5733	/*
5734	 * Hack: delayed name len checking.
5735	 */
5736	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5737		cache_fpl_smr_exit(fpl);
5738		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5739	}
5740
5741	/*
5742	 * Hack: they may be looking up foo/bar, where foo is not a directory.
5743	 * In such a case we need to return ENOTDIR, but we may happen to get
5744	 * here with a different error.
5745	 */
5746	if (dvp->v_type != VDIR) {
5747		error = ENOTDIR;
5748	}
5749
5750	/*
5751	 * Hack: handle O_SEARCH.
5752	 *
5753	 * Open Group Base Specifications Issue 7, 2018 edition states:
5754	 * <quote>
5755	 * If the access mode of the open file description associated with the
5756	 * file descriptor is not O_SEARCH, the function shall check whether
5757	 * directory searches are permitted using the current permissions of
5758	 * the directory underlying the file descriptor. If the access mode is
5759	 * O_SEARCH, the function shall not perform the check.
5760	 * </quote>
5761	 *
5762	 * Regular lookup tests for the NOEXECCHECK flag for every path
5763	 * component to decide whether to do the permission check. However,
5764	 * since most lookups never have the flag (and when they do it is only
5765	 * present for the first path component), lockless lookup only acts on
5766	 * it if there is a permission problem. Here the flag is represented
5767	 * with a boolean so that we don't have to clear it on the way out.
5768	 *
5769	 * For simplicity this always aborts.
5770	 * TODO: check if this is the first lookup and ignore the permission
5771	 * problem. Note the flag has to survive fallback (if it happens to be
5772	 * performed).
5773	 */
5774	if (fpl->fsearch) {
5775		return (cache_fpl_aborted(fpl));
5776	}
5777
5778	switch (error) {
5779	case EAGAIN:
5780		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5781			error = cache_fpl_aborted(fpl);
5782		} else {
5783			cache_fpl_partial(fpl);
5784		}
5785		break;
5786	default:
5787		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5788			error = cache_fpl_aborted(fpl);
5789		} else {
5790			cache_fpl_smr_exit(fpl);
5791			cache_fpl_handled_error(fpl, error);
5792		}
5793		break;
5794	}
5795	return (error);
5796}
5797
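/*
 * The main lookup loop: parse a component, check execute permission on the
 * current directory, find the next vnode, resolve symlinks and advance until
 * the last component is reached or the lookup terminates one way or another.
 */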
5798static int
5799cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
5800{
5801	struct nameidata *ndp;
5802	struct componentname *cnp;
5803	struct mount *mp;
5804	int error;
5805
5806	ndp = fpl->ndp;
5807	cnp = fpl->cnp;
5808
5809	cache_fpl_checkpoint(fpl);
5810
5811	/*
5812	 * The vnode at hand is almost always stable; skip checking for it.
5813	 * Worst case this postpones the check towards the end of the iteration
5814	 * of the main loop.
5815	 */
	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
		return (cache_fpl_aborted(fpl));
	}

	MPASS(fpl->tvp == NULL);

	for (;;) {
		cache_fplookup_parse(fpl);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(cache_fpl_terminated(fpl))) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (fpl->tvp->v_type == VLNK) {
			error = cache_fplookup_symlink(fpl);
			if (cache_fpl_terminated(fpl)) {
				break;
			}
		} else {
			if (cache_fpl_islastcn(ndp)) {
				error = cache_fplookup_final(fpl);
				break;
			}

			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
				error = cache_fpl_aborted(fpl);
				break;
			}

			fpl->dvp = fpl->tvp;
			fpl->dvp_seqc = fpl->tvp_seqc;
			cache_fplookup_parse_advance(fpl);
		}

		cache_fpl_checkpoint(fpl);
	}

	return (error);
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 * vn_lock(current);
 * for (;;) {
 *	next = find();
 *	vn_lock(next);
 *	vn_unlock(current);
 *	current = next;
 *	if (last)
 *	    break;
 * }
 * return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
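 *
 * For example, a spot which changes something relevant for lookup would be
 * bracketed roughly like so (an illustrative sketch, not code from this
 * file):
 *
 * vn_seqc_write_begin(vp);
 * change permissions, ownership, link state, mount coverage, ...;
 * vn_seqc_write_end(vp);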
 *
 * Thus this translates to:
 *
 * vfs_smr_enter();
 * dvp_seqc = seqc_read_any(dvp);
 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *     abort();
 * for (;;) {
 * 	tvp = find();
 * 	tvp_seqc = seqc_read_any(tvp);
 * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 * 	    abort();
 * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 * 	    abort();
 * 	dvp = tvp; // we know nothing of importance has changed
 * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 * 	if (last)
 * 	    break;
 * }
 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *     abort();
 * // at this point we know nothing has changed for any parent<->child pair
 * // as they were crossed during the lookup, meaning we matched the guarantee
 * // of the locked variant
 * return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed, the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (see the sketch past the caveats below)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
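 *
 * A minimal sketch of a conforming routine for a hypothetical "foofs"
 * filesystem, whose per-vnode data is reclaimed via vfs_smr and carries
 * stable mode/uid/gid fields (all foofs names below are made up for
 * illustration; the binding requirements are only the ones listed above):
 *
 * static int
 * foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 * {
 *	struct vnode *vp = v->a_vp;
 *	struct foofs_node *node;
 *
 *	VFS_SMR_ASSERT_ENTERED();
 *
 *	node = atomic_load_ptr(&vp->v_data);
 *	if (__predict_false(node == NULL))
 *		return (EAGAIN);
 *	return (vaccess_vexec_smr(node->fn_mode, node->fn_uid, node->fn_gid,
 *	    v->a_cred));
 * }
 *
 * Apart from providing the routine, such a filesystem is expected to set
 * MNTK_FPLOOKUP on its mount point only once the above holds for all of its
 * vnodes.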
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	int error;

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.in_smr = false;
	fpl.ndp = ndp;
	fpl.cnp = cnp = &ndp->ni_cnd;
	MPASS(ndp->ni_lcf == 0);
	MPASS(curthread == cnp->cn_thread);
	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
	    cnp->cn_flags));
	if ((cnp->cn_flags & SAVESTART) != 0) {
		MPASS(cnp->cn_nameiop != LOOKUP);
	}
	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);

	if (__predict_false(!cache_can_fplookup(&fpl))) {
		*status = fpl.status;
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint_outer(&fpl);

	cache_fpl_smr_enter_initial(&fpl);
#ifdef INVARIANTS
	fpl.debug.ni_pathlen = ndp->ni_pathlen;
#endif
	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	fpl.fsearch = false;
	fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
	fpl.tvp = NULL; /* for degenerate path handling */
	fpl.pwd = pwdp;
	pwd = pwd_get_smr();
	*(fpl.pwd) = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	if (cnp->cn_pnbuf[0] == '/') {
		dvp = cache_fpl_handle_root(&fpl);
		MPASS(ndp->ni_resflags == 0);
		ndp->ni_resflags = NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	cache_fpl_assert_status(&fpl);
	*status = fpl.status;
	if (SDT_PROBES_ENABLED()) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
			    ndp);
	}

	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
		MPASS(error != CACHE_FPL_FAILED);
		if (error != 0) {
			MPASS(fpl.dvp == NULL);
			MPASS(fpl.tvp == NULL);
			MPASS(fpl.savename == false);
		}
		ndp->ni_dvp = fpl.dvp;
		ndp->ni_vp = fpl.tvp;
		if (fpl.savename) {
			cnp->cn_flags |= HASBUF;
		} else {
			cache_fpl_cleanup_cnp(cnp);
		}
	}
	return (error);
}