1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38#include "opt_capsicum.h"
39#include "opt_ktrace.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/dirent.h>
44#include <sys/kernel.h>
45#include <sys/capsicum.h>
46#include <sys/fcntl.h>
47#include <sys/jail.h>
48#include <sys/lock.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/vnode.h>
52#include <sys/mount.h>
53#include <sys/filedesc.h>
54#include <sys/proc.h>
55#include <sys/sdt.h>
56#include <sys/syscallsubr.h>
57#include <sys/sysctl.h>
58#ifdef KTRACE
59#include <sys/ktrace.h>
60#endif
61#ifdef INVARIANTS
62#include <machine/_inttypes.h>
63#endif
64
65#include <security/audit/audit.h>
66#include <security/mac/mac_framework.h>
67
68#include <vm/uma.h>
69
/*
 * NDVALIDATE() is a debugging hook: under INVARIANTS it expands to a call to
 * NDVALIDATE_impl() that also passes the source line of the invocation, and
 * otherwise it expands to nothing.
 */
#ifdef INVARIANTS
static void NDVALIDATE_impl(struct nameidata *, int);
#define NDVALIDATE(ndp) NDVALIDATE_impl(ndp, __LINE__)
#else
#define NDVALIDATE(ndp)
#endif
76
/*
 * Prepare namei() to restart.  Reset the components to their original state
 * and set the ISRESTARTED flag, which signals the underlying lookup code to
 * change the root from the ABI root to the actual root and prevents any
 * further restarts.
 *
 * The argument is parenthesized at every member access so that an arbitrary
 * pointer-valued expression may be passed.  Note that it is still evaluated
 * more than once, so it must not have side effects.
 */
#define	NDRESTART(ndp) do {						\
	NDREINIT_DBG(ndp);						\
	(ndp)->ni_resflags = 0;						\
	(ndp)->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;		\
	(ndp)->ni_cnd.cn_flags |= ISRESTARTED;				\
} while (0)
88
/*
 * With KTRACE compiled in, log a CAPFAIL_NAMEI capability failure for the
 * given path through ktrcapfail(); without KTRACE this is a no-op.
 */
#ifdef KTRACE
#define	NIKTRCAPFAIL(path)	ktrcapfail(CAPFAIL_NAMEI, (path))
#else
#define	NIKTRCAPFAIL(path)
#endif
94
/*
 * Report a capability violation for 'path' via NIKTRCAPFAIL() and then clear
 * the NI_LCF_KTR_FLAGS bits in ndp->ni_lcf (presumably so that one lookup
 * reports a violation only once -- TODO confirm against NI_LCF_KTR_FLAGS).
 */
#define	NI_CAP_VIOLATION(ndp, path)	do {			\
	NIKTRCAPFAIL(path);					\
	(ndp)->ni_lcf &= ~NI_LCF_KTR_FLAGS;			\
} while (0)
99
/*
 * Statically defined tracing probes for namei: vfs:namei:lookup:entry is
 * fired with (starting vnode, path buffer, cn_flags, bool) and
 * vfs:namei:lookup:return with (error, result vnode, bool, nameidata).
 * Every SDT_PROBE4() call in this part of the file passes 'false' for the
 * bool argument.
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
    "unsigned long", "bool");
SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool",
    "struct nameidata");
105
/*
 * Allocation zone for namei.  Items are MAXPATHLEN bytes long and hold the
 * pathname buffers (cn_pnbuf); the zone is created in nameiinit().
 */
uma_zone_t namei_zone;

/*
 * Placeholder vnode for mp traversal.  Allocated in nameiinit() and installed
 * as ni_dvp by vfs_lookup_cross_mount() once the real parent was released.
 */
static struct vnode *vp_crossmp;
111
/*
 * VOP_ISLOCKED for the crossmp placeholder vnode: unconditionally reports
 * the vnode as share-locked; the arguments are not examined.
 */
static int
crossmp_vop_islocked(struct vop_islocked_args *ap)
{

	return (LK_SHARED);
}
118
119static int
120crossmp_vop_lock1(struct vop_lock1_args *ap)
121{
122	struct vnode *vp;
123	struct lock *lk __diagused;
124	int flags;
125
126	vp = ap->a_vp;
127	lk = vp->v_vnlock;
128	flags = ap->a_flags;
129
130	KASSERT((flags & (LK_SHARED | LK_NOWAIT)) == (LK_SHARED | LK_NOWAIT),
131	    ("%s: invalid lock request 0x%x for crossmp", __func__, flags));
132
133	if ((flags & LK_INTERLOCK) != 0)
134		VI_UNLOCK(vp);
135	LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line);
136	return (0);
137}
138
139static int
140crossmp_vop_unlock(struct vop_unlock_args *ap)
141{
142	struct vnode *vp;
143	struct lock *lk __diagused;
144
145	vp = ap->a_vp;
146	lk = vp->v_vnlock;
147
148	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
149	    LOCK_LINE);
150	return (0);
151}
152
/*
 * Vnode operations for the crossmp placeholder vnode.  Only the locking
 * entry points are overridden; everything else falls through to
 * default_vnodeops.
 */
static struct vop_vector crossmp_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_islocked =		crossmp_vop_islocked,
	.vop_lock1 =		crossmp_vop_lock1,
	.vop_unlock =		crossmp_vop_unlock,
};
/*
 * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
 * gets allocated early. See nameiinit for the direct call below.
 */
163
/*
 * One entry of the per-lookup list (ndp->ni_cap_tracker) of directories
 * recorded by nameicap_tracker_add() and consulted for ".." checks by
 * nameicap_check_dotdot().
 */
struct nameicap_tracker {
	struct vnode *dp;	/* recorded directory, kept with vhold() */
	TAILQ_ENTRY(nameicap_tracker) nm_link;	/* ni_cap_tracker linkage */
};
168
/*
 * Malloc type for cap mode tracker elements used for dotdot capability
 * checks.
 */
170MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot");
171
/*
 * One-time initialization, run through SYSINIT at SI_SUB_VFS /
 * SI_ORDER_SECOND: create the pathname buffer zone and set up the crossmp
 * placeholder vnode.
 */
static void
nameiinit(void *dummy __unused)
{

	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	/*
	 * The vop vector is registered by hand because the vnode using it
	 * is allocated right here.
	 */
	vfs_vector_op_register(&crossmp_vnodeops);
	/* NOTE(review): the getnewvnode() return value is not checked. */
	getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
	vp_crossmp->v_state = VSTATE_CONSTRUCTED;
	vp_crossmp->v_irflag |= VIRF_CROSSMP;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
184
/*
 * vfs.lookup_cap_dotdot (default 1): when non-zero, namei_setup() grants
 * NI_LCF_CAP_DOTDOT to strict-relative lookups, which lets
 * nameicap_check_dotdot() accept ".." that leads back to a directory
 * already recorded for this lookup.
 */
static int lookup_cap_dotdot = 1;
SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
    &lookup_cap_dotdot, 0,
    "enables \"..\" components in path lookup in capability mode");
/*
 * vfs.lookup_cap_dotdot_nonlocal (default 1): when zero,
 * nameicap_check_dotdot() fails ".." on a vnode whose mount does not have
 * MNT_LOCAL set.
 */
static int lookup_cap_dotdot_nonlocal = 1;
SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
    &lookup_cap_dotdot_nonlocal, 0,
    "enables \"..\" components in path lookup in capability mode "
    "on non-local mount");
194
195static void
196nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
197{
198	struct nameicap_tracker *nt;
199
200	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
201		return;
202	nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
203	if (nt != NULL && nt->dp == dp)
204		return;
205	nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
206	vhold(dp);
207	nt->dp = dp;
208	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
209}
210
211static void
212nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
213{
214	struct nameicap_tracker *nt, *nt1;
215
216	nt = first;
217	TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
218		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
219		vdrop(nt->dp);
220		free(nt, M_NAMEITRACKER);
221	}
222}
223
224static void
225nameicap_cleanup(struct nameidata *ndp)
226{
227	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
228	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
229	nameicap_cleanup_from(ndp, NULL);
230}
231
232/*
233 * For dotdot lookups in capability mode, only allow the component
234 * lookup to succeed if the resulting directory was already traversed
235 * during the operation.  This catches situations where already
236 * traversed directory is moved to different parent, and then we walk
237 * over it with dotdots.
238 *
239 * Also allow to force failure of dotdot lookups for non-local
240 * filesystems, where external agents might assist local lookups to
241 * escape the compartment.
242 */
243static int
244nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
245{
246	struct nameicap_tracker *nt;
247	struct mount *mp;
248
249	if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf &
250	    NI_LCF_STRICTREL) == 0)
251		return (0);
252	if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR |
253	    NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR))
254		NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
255	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
256		return (ENOTCAPABLE);
257	mp = dp->v_mount;
258	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
259	    (mp->mnt_flag & MNT_LOCAL) == 0)
260		goto capfail;
261	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
262	    nm_link) {
263		if (dp == nt->dp) {
264			nt = TAILQ_NEXT(nt, nm_link);
265			if (nt != NULL)
266				nameicap_cleanup_from(ndp, nt);
267			return (0);
268		}
269	}
270
271capfail:
272	if (__predict_false((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0))
273		NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
274	return (ENOTCAPABLE);
275}
276
277static void
278namei_cleanup_cnp(struct componentname *cnp)
279{
280
281	uma_zfree(namei_zone, cnp->cn_pnbuf);
282	cnp->cn_pnbuf = NULL;
283	cnp->cn_nameptr = NULL;
284}
285
286static int
287namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
288{
289	struct componentname *cnp;
290
291	cnp = &ndp->ni_cnd;
292	if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL |
293	    NI_LCF_STRICTREL_KTR)) != 0)) {
294		if ((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0)
295			NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf);
296		if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0)
297			return (ENOTCAPABLE);
298	}
299	while (*(cnp->cn_nameptr) == '/') {
300		cnp->cn_nameptr++;
301		ndp->ni_pathlen--;
302	}
303	*dpp = ndp->ni_rootdir;
304	vrefact(*dpp);
305	return (0);
306}
307
/*
 * Establish the starting point of a translation.
 *
 * Sets the capability-related bits in ndp->ni_lcf, holds the thread's pwd,
 * and selects the starting directory: the root for an absolute path,
 * otherwise ndp->ni_startdir if supplied, the current directory for
 * AT_FDCWD, or the vnode obtained from ni_dirfd by fgetvp_lookup().
 *
 * On success returns 0 with the starting vnode in *dpp and the held pwd in
 * *pwdp; the callers in this file release the latter with pwd_drop().  On
 * failure returns an errno value with *pwdp left NULL; a vnode already
 * stored in *dpp has been vrele()d and the pwd has been dropped.  A supplied
 * ni_startdir that ended up unused is vrele()d in either case, except on
 * the early ECAPMODE return, which happens before any of this.
 */
static int
namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
{
	struct componentname *cnp;
	struct thread *td;
	struct pwd *pwd;
	int error;
	bool startdir_used;

	cnp = &ndp->ni_cnd;
	td = curthread;

	startdir_used = false;
	*pwdp = NULL;
	*dpp = NULL;

#ifdef CAPABILITY_MODE
	/*
	 * In capability mode, lookups must be restricted to happen in
	 * the subtree with the root specified by the file descriptor:
	 * - The root must be real file descriptor, not the pseudo-descriptor
	 *   AT_FDCWD.
	 * - The passed path must be relative and not absolute.
	 * - If lookup_cap_dotdot is disabled, path must not contain the
	 *   '..' components.
	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
	 *   components lookups result in the directories which were
	 *   previously walked by us, which prevents an escape from
	 *   the relative root.
	 */
	if ((cnp->cn_flags & NOCAPCHECK) == 0) {
		if (CAP_TRACING(td)) {
			ndp->ni_lcf |= NI_LCF_STRICTREL_KTR;
			if (ndp->ni_dirfd == AT_FDCWD)
				NI_CAP_VIOLATION(ndp, "AT_FDCWD");
		}
		if (IN_CAPABILITY_MODE(td)) {
			ndp->ni_lcf |= NI_LCF_STRICTREL;
			ndp->ni_resflags |= NIRES_STRICTREL;
			/* Nothing is held yet, so a plain return is enough. */
			if (ndp->ni_dirfd == AT_FDCWD)
				return (ECAPMODE);
		}
	}
#endif
	error = 0;

	/*
	 * Get starting point for the translation.
	 */
	pwd = pwd_hold(td);
	/*
	 * The reference on ni_rootdir is acquired in the block below to avoid
	 * back-to-back atomics for absolute lookups.
	 */
	namei_setup_rootdir(ndp, cnp, pwd);
	ndp->ni_topdir = pwd->pwd_jdir;

	if (cnp->cn_pnbuf[0] == '/') {
		ndp->ni_resflags |= NIRES_ABS;
		error = namei_handle_root(ndp, dpp);
	} else {
		if (ndp->ni_startdir != NULL) {
			/* The caller's reference is passed on as is. */
			*dpp = ndp->ni_startdir;
			startdir_used = true;
		} else if (ndp->ni_dirfd == AT_FDCWD) {
			*dpp = pwd->pwd_cdir;
			vrefact(*dpp);
		} else {
			if (cnp->cn_flags & AUDITVNODE1)
				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
			if (cnp->cn_flags & AUDITVNODE2)
				AUDIT_ARG_ATFD2(ndp->ni_dirfd);

			error = fgetvp_lookup(ndp, dpp);
		}
		/*
		 * A non-directory starting point is only acceptable for an
		 * empty path with EMPTYPATH set.
		 */
		if (error == 0 && (*dpp)->v_type != VDIR &&
		    (cnp->cn_pnbuf[0] != '\0' ||
		    (cnp->cn_flags & EMPTYPATH) == 0))
			error = ENOTDIR;
	}
	/*
	 * RBENEATH refuses absolute paths and otherwise applies the
	 * strict-relative rules, with dotdot tracking, to lookups that are
	 * not already strict-relative.
	 */
	if (error == 0 && (cnp->cn_flags & RBENEATH) != 0) {
		if (cnp->cn_pnbuf[0] == '/') {
			error = ENOTCAPABLE;
		} else if ((ndp->ni_lcf & NI_LCF_STRICTREL) == 0) {
			ndp->ni_lcf |= NI_LCF_STRICTREL |
			    NI_LCF_CAP_DOTDOT;
		}
	}

	/*
	 * If we are auditing the kernel pathname, save the user pathname.
	 */
	if (AUDITING_TD(td)) {
		if (cnp->cn_flags & AUDITVNODE1)
			AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
		if (cnp->cn_flags & AUDITVNODE2)
			AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
	}
	/* A supplied start directory not picked above is released here. */
	if (ndp->ni_startdir != NULL && !startdir_used)
		vrele(ndp->ni_startdir);
	if (error != 0) {
		if (*dpp != NULL)
			vrele(*dpp);
		pwd_drop(pwd);
		return (error);
	}
	/* Grant dotdot handling to strict-relative lookups if enabled. */
	if (lookup_cap_dotdot != 0) {
		if ((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0)
			ndp->ni_lcf |= NI_LCF_CAP_DOTDOT_KTR;
		if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0)
			ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
	}
	SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
	    cnp->cn_flags, false);
	*pwdp = pwd;
	return (0);
}
425
426static int
427namei_getpath(struct nameidata *ndp)
428{
429	struct componentname *cnp;
430	int error;
431
432	cnp = &ndp->ni_cnd;
433
434	/*
435	 * Get a buffer for the name to be translated, and copy the
436	 * name into the buffer.
437	 */
438	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
439	if (ndp->ni_segflg == UIO_SYSSPACE) {
440		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
441		    &ndp->ni_pathlen);
442	} else {
443		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
444		    &ndp->ni_pathlen);
445	}
446
447	return (error);
448}
449
450static int
451namei_emptypath(struct nameidata *ndp)
452{
453	struct componentname *cnp;
454	struct pwd *pwd;
455	struct vnode *dp;
456	int error;
457
458	cnp = &ndp->ni_cnd;
459	MPASS(*cnp->cn_pnbuf == '\0');
460	MPASS((cnp->cn_flags & EMPTYPATH) != 0);
461	MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
462
463	ndp->ni_resflags |= NIRES_EMPTYPATH;
464	error = namei_setup(ndp, &dp, &pwd);
465	if (error != 0) {
466		goto errout;
467	}
468
469	/*
470	 * Usecount on dp already provided by namei_setup.
471	 */
472	ndp->ni_vp = dp;
473	pwd_drop(pwd);
474	NDVALIDATE(ndp);
475	if ((cnp->cn_flags & LOCKLEAF) != 0) {
476		VOP_LOCK(dp, (cnp->cn_flags & LOCKSHARED) != 0 ?
477		    LK_SHARED : LK_EXCLUSIVE);
478		if (VN_IS_DOOMED(dp)) {
479			vput(dp);
480			error = ENOENT;
481			goto errout;
482		}
483	}
484	SDT_PROBE4(vfs, namei, lookup, return, 0, ndp->ni_vp, false, ndp);
485	return (0);
486
487errout:
488	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
489	namei_cleanup_cnp(cnp);
490	return (error);
491}
492
493static int __noinline
494namei_follow_link(struct nameidata *ndp)
495{
496	char *cp;
497	struct iovec aiov;
498	struct uio auio;
499	struct componentname *cnp;
500	struct thread *td;
501	int error, linklen;
502
503	error = 0;
504	cnp = &ndp->ni_cnd;
505	td = curthread;
506
507	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
508		error = ELOOP;
509		goto out;
510	}
511#ifdef MAC
512	if ((cnp->cn_flags & NOMACCHECK) == 0) {
513		error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp);
514		if (error != 0)
515			goto out;
516	}
517#endif
518	if (ndp->ni_pathlen > 1)
519		cp = uma_zalloc(namei_zone, M_WAITOK);
520	else
521		cp = cnp->cn_pnbuf;
522	aiov.iov_base = cp;
523	aiov.iov_len = MAXPATHLEN;
524	auio.uio_iov = &aiov;
525	auio.uio_iovcnt = 1;
526	auio.uio_offset = 0;
527	auio.uio_rw = UIO_READ;
528	auio.uio_segflg = UIO_SYSSPACE;
529	auio.uio_td = td;
530	auio.uio_resid = MAXPATHLEN;
531	error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
532	if (error != 0) {
533		if (ndp->ni_pathlen > 1)
534			uma_zfree(namei_zone, cp);
535		goto out;
536	}
537	linklen = MAXPATHLEN - auio.uio_resid;
538	if (linklen == 0) {
539		if (ndp->ni_pathlen > 1)
540			uma_zfree(namei_zone, cp);
541		error = ENOENT;
542		goto out;
543	}
544	if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
545		if (ndp->ni_pathlen > 1)
546			uma_zfree(namei_zone, cp);
547		error = ENAMETOOLONG;
548		goto out;
549	}
550	if (ndp->ni_pathlen > 1) {
551		bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
552		uma_zfree(namei_zone, cnp->cn_pnbuf);
553		cnp->cn_pnbuf = cp;
554	} else
555		cnp->cn_pnbuf[linklen] = '\0';
556	ndp->ni_pathlen += linklen;
557out:
558	return (error);
559}
560
/*
 * Convert a pathname into a pointer to a locked vnode.
 *
 * The FOLLOW flag is set when symbolic links are to be followed
 * when they occur at the end of the name translation process.
 * Symbolic links are always followed for all other pathname
 * components other than the last.
 *
 * The segflg defines whether the name is to be copied from user
 * space or kernel space.
 *
 * Overall outline of namei:
 *
 *	copy in name
 *	get starting directory
 *	while (!done && !error) {
 *		call lookup to search path.
 *		if symbolic link, massage name in buffer and continue
 *	}
 *
 * Returns 0 on success and an errno value otherwise.  On the error paths
 * of the locked lookup the pathname buffer, the capability tracker and the
 * pwd reference are all released before returning.
 *
 * The whole translation is restarted at most once (see NDRESTART()) when
 * pwd_adir differs from pwd_rdir: after any error handled entirely by the
 * lockless lookup, and after ENOENT from the locked lookup.
 */
int
namei(struct nameidata *ndp)
{
	struct vnode *dp;	/* the directory we are searching */
	struct componentname *cnp;
	struct thread *td;
	struct pwd *pwd;
	int error;
	enum cache_fpl_status status;

	cnp = &ndp->ni_cnd;
	td = curthread;
#ifdef INVARIANTS
	KASSERT((ndp->ni_debugflags & NAMEI_DBG_CALLED) == 0,
	    ("%s: repeated call to namei without NDREINIT", __func__));
	KASSERT(ndp->ni_debugflags == NAMEI_DBG_INITED,
	    ("%s: bad debugflags %d", __func__, ndp->ni_debugflags));
	ndp->ni_debugflags |= NAMEI_DBG_CALLED;
	if (ndp->ni_startdir != NULL)
		ndp->ni_debugflags |= NAMEI_DBG_HADSTARTDIR;
	if (cnp->cn_flags & FAILIFEXISTS) {
		KASSERT(cnp->cn_nameiop == CREATE,
		    ("%s: FAILIFEXISTS passed for op %d", __func__, cnp->cn_nameiop));
		/*
		 * The limitation below is to restrict hairy corner cases.
		 */
		KASSERT((cnp->cn_flags & (LOCKPARENT | LOCKLEAF)) == LOCKPARENT,
		    ("%s: FAILIFEXISTS must be passed with LOCKPARENT and without LOCKLEAF",
		    __func__));
	}
#endif
	ndp->ni_cnd.cn_cred = td->td_ucred;
	KASSERT(ndp->ni_resflags == 0, ("%s: garbage in ni_resflags: %x\n",
	    __func__, ndp->ni_resflags));
	KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc"));
	KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0,
	    ("namei: unexpected flags: %" PRIx64 "\n",
	    cnp->cn_flags & NAMEI_INTERNAL_FLAGS));
	if (cnp->cn_flags & NOCACHE)
		KASSERT(cnp->cn_nameiop != LOOKUP,
		    ("%s: NOCACHE passed with LOOKUP", __func__));
	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
	    ndp->ni_startdir->v_type == VBAD);

restart:
	/* Per-attempt state; reached again after NDRESTART(). */
	ndp->ni_lcf = 0;
	ndp->ni_loopcnt = 0;
	ndp->ni_vp = NULL;

	error = namei_getpath(ndp);
	if (__predict_false(error != 0)) {
		namei_cleanup_cnp(cnp);
		SDT_PROBE4(vfs, namei, lookup, return, error, NULL,
		    false, ndp);
		return (error);
	}

	cnp->cn_nameptr = cnp->cn_pnbuf;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_NAMEI)) {
		ktrnamei(cnp->cn_pnbuf);
	}
#endif
	TSNAMEI(curthread->td_proc->p_pid, cnp->cn_pnbuf);

	/*
	 * First try looking up the target without locking any vnodes.
	 *
	 * We may need to start from scratch or pick up where it left off.
	 */
	error = cache_fplookup(ndp, &status, &pwd);
	switch (status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		/* The lockless lookup produced the final result or error. */
		if (error == 0)
			NDVALIDATE(ndp);
		else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
		    (cnp->cn_flags & ISRESTARTED) == 0)) {
			namei_cleanup_cnp(cnp);
			NDRESTART(ndp);
			goto restart;
		}
		return (error);
	case CACHE_FPL_STATUS_PARTIAL:
		/* Continue with the locked lookup from ni_startdir. */
		TAILQ_INIT(&ndp->ni_cap_tracker);
		dp = ndp->ni_startdir;
		break;
	case CACHE_FPL_STATUS_DESTROYED:
		/*
		 * Fetch the pathname again and start from scratch
		 * (presumably the lockless attempt left the buffer
		 * unusable -- confirm against cache_fplookup()).
		 */
		ndp->ni_loopcnt = 0;
		error = namei_getpath(ndp);
		if (__predict_false(error != 0)) {
			namei_cleanup_cnp(cnp);
			return (error);
		}
		cnp->cn_nameptr = cnp->cn_pnbuf;
		/* FALLTHROUGH */
	case CACHE_FPL_STATUS_ABORTED:
		/* Full locked lookup starting at the beginning of the path. */
		TAILQ_INIT(&ndp->ni_cap_tracker);
		MPASS(ndp->ni_lcf == 0);
		if (*cnp->cn_pnbuf == '\0') {
			if ((cnp->cn_flags & EMPTYPATH) != 0) {
				return (namei_emptypath(ndp));
			}
			namei_cleanup_cnp(cnp);
			SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL,
			    false, ndp);
			return (ENOENT);
		}
		error = namei_setup(ndp, &dp, &pwd);
		if (error != 0) {
			namei_cleanup_cnp(cnp);
			return (error);
		}
		break;
	}

	/*
	 * Locked lookup.
	 */
	for (;;) {
		ndp->ni_startdir = dp;
		error = vfs_lookup(ndp);
		if (error != 0) {
			if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
			    error == ENOENT &&
			    (cnp->cn_flags & ISRESTARTED) == 0)) {
				/* Drop everything and retry once. */
				nameicap_cleanup(ndp);
				pwd_drop(pwd);
				namei_cleanup_cnp(cnp);
				NDRESTART(ndp);
				goto restart;
			} else
				goto out;
		}

		/*
		 * If not a symbolic link, we're done.
		 */
		if ((cnp->cn_flags & ISSYMLINK) == 0) {
			SDT_PROBE4(vfs, namei, lookup, return, error,
			    ndp->ni_vp, false, ndp);
			nameicap_cleanup(ndp);
			pwd_drop(pwd);
			NDVALIDATE(ndp);
			return (0);
		}
		error = namei_follow_link(ndp);
		if (error != 0)
			break;
		/* Release the link; the next pass starts at its parent. */
		vput(ndp->ni_vp);
		dp = ndp->ni_dvp;
		/*
		 * Check if root directory should replace current directory.
		 */
		cnp->cn_nameptr = cnp->cn_pnbuf;
		if (*(cnp->cn_nameptr) == '/') {
			/*
			 * Reset the lookup to start from the real root without
			 * origin path name reloading.
			 */
			if (__predict_false(ndp->ni_rootdir != pwd->pwd_rdir)) {
				cnp->cn_flags |= ISRESTARTED;
				ndp->ni_rootdir = pwd->pwd_rdir;
			}
			vrele(dp);
			error = namei_handle_root(ndp, &dp);
			if (error != 0)
				goto out;
		}
	}
	/* namei_follow_link() failed: release the link and its parent. */
	vput(ndp->ni_vp);
	ndp->ni_vp = NULL;
	vrele(ndp->ni_dvp);
out:
	MPASS(error != 0);
	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
	namei_cleanup_cnp(cnp);
	nameicap_cleanup(ndp);
	pwd_drop(pwd);
	return (error);
}
765
766static int
767enforce_lkflags(struct mount *mp, int lkflags)
768{
769
770	if (mp == NULL || ((lkflags & LK_SHARED) &&
771	    !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
772		lkflags &= ~LK_SHARED;
773		lkflags |= LK_EXCLUSIVE;
774	}
775	lkflags |= LK_NODDLKTREAT;
776	return (lkflags);
777}
778
779static __inline int
780needs_exclusive_leaf(struct mount *mp, int flags)
781{
782
783	/*
784	 * Intermediate nodes can use shared locks, we only need to
785	 * force an exclusive lock for leaf nodes.
786	 */
787	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
788		return (0);
789
790	/* Always use exclusive locks if LOCKSHARED isn't set. */
791	if (!(flags & LOCKSHARED))
792		return (1);
793
794	/*
795	 * For lookups during open(), if the mount point supports
796	 * extended shared operations, then use a shared lock for the
797	 * leaf node, otherwise use an exclusive lock.
798	 */
799	if ((flags & ISOPEN) != 0)
800		return (!MNT_EXTENDED_SHARED(mp));
801
802	/*
803	 * Lookup requests outside of open() that specify LOCKSHARED
804	 * only need a shared lock on the leaf vnode.
805	 */
806	return (0);
807}
808
809/*
810 * Various filesystems expect to be able to copy a name component with length
811 * bounded by NAME_MAX into a directory entry buffer of size MAXNAMLEN.  Make
812 * sure that these are the same size.
813 */
814_Static_assert(MAXNAMLEN == NAME_MAX,
815    "MAXNAMLEN and NAME_MAX have different values");
816
817static int __noinline
818vfs_lookup_degenerate(struct nameidata *ndp, struct vnode *dp, int wantparent)
819{
820	struct componentname *cnp;
821	struct mount *mp;
822	int error;
823
824	cnp = &ndp->ni_cnd;
825
826	cnp->cn_flags |= ISLASTCN;
827
828	mp = atomic_load_ptr(&dp->v_mount);
829	if (needs_exclusive_leaf(mp, cnp->cn_flags)) {
830		cnp->cn_lkflags &= ~LK_SHARED;
831		cnp->cn_lkflags |= LK_EXCLUSIVE;
832	}
833
834	vn_lock(dp, enforce_lkflags(mp, cnp->cn_lkflags | LK_RETRY));
835
836	if (dp->v_type != VDIR) {
837		error = ENOTDIR;
838		goto bad;
839	}
840	if (cnp->cn_nameiop != LOOKUP) {
841		error = EISDIR;
842		goto bad;
843	}
844	if (wantparent) {
845		ndp->ni_dvp = dp;
846		VREF(dp);
847	}
848	ndp->ni_vp = dp;
849	cnp->cn_namelen = 0;
850
851	if (cnp->cn_flags & AUDITVNODE1)
852		AUDIT_ARG_VNODE1(dp);
853	else if (cnp->cn_flags & AUDITVNODE2)
854		AUDIT_ARG_VNODE2(dp);
855
856	if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
857		VOP_UNLOCK(dp);
858	return (0);
859bad:
860	VOP_UNLOCK(dp);
861	return (error);
862}
863
864/*
865 * FAILIFEXISTS handling.
866 *
867 * XXX namei called with LOCKPARENT but not LOCKLEAF has the strange
868 * behaviour of leaving the vnode unlocked if the target is the same
869 * vnode as the parent.
870 */
871static int __noinline
872vfs_lookup_failifexists(struct nameidata *ndp)
873{
874	struct componentname *cnp __diagused;
875
876	cnp = &ndp->ni_cnd;
877
878	MPASS((cnp->cn_flags & ISSYMLINK) == 0);
879	if (ndp->ni_vp == ndp->ni_dvp)
880		vrele(ndp->ni_dvp);
881	else
882		vput(ndp->ni_dvp);
883	vrele(ndp->ni_vp);
884	ndp->ni_dvp = NULL;
885	ndp->ni_vp = NULL;
886	NDFREE_PNBUF(ndp);
887	return (EEXIST);
888}
889
890static int __noinline
891vfs_lookup_cross_mount(struct nameidata *ndp)
892{
893	struct componentname *cnp;
894	struct mount *mp;
895	struct vnode *dp, *tdp;
896	int error, crosslkflags;
897	bool crosslock;
898
899	cnp = &ndp->ni_cnd;
900	dp = ndp->ni_vp;
901
902	/*
903	 * The vnode has been mounted on, find the root of the mounted
904	 * filesystem.
905	 */
906	do {
907		mp = dp->v_mountedhere;
908		ASSERT_VOP_LOCKED(dp, __func__);
909		VNPASS((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 && mp != NULL, dp);
910
911		crosslock = (dp->v_vflag & VV_CROSSLOCK) != 0;
912		crosslkflags = enforce_lkflags(mp, cnp->cn_lkflags);
913		if (__predict_false(crosslock)) {
914			/*
915			 * We are going to be holding the vnode lock, which
916			 * in this case is shared by the root vnode of the
917			 * filesystem mounted at mp, across the call to
918			 * VFS_ROOT().  Make the situation clear to the
919			 * filesystem by passing LK_CANRECURSE if the
920			 * lock is held exclusive, or by clearinng
921			 * LK_NODDLKTREAT to allow recursion on the shared
922			 * lock in the presence of an exclusive waiter.
923			 */
924			if (VOP_ISLOCKED(dp) == LK_EXCLUSIVE) {
925				crosslkflags &= ~LK_SHARED;
926				crosslkflags |= LK_EXCLUSIVE | LK_CANRECURSE;
927			} else if ((crosslkflags & LK_EXCLUSIVE) != 0) {
928				error = vn_lock(dp, LK_UPGRADE);
929				if (error != 0) {
930					MPASS(error == ENOENT);
931					vrele(dp);
932					if (dp != ndp->ni_dvp)
933						vput(ndp->ni_dvp);
934					else
935						vrele(ndp->ni_dvp);
936					break;
937				}
938				if (dp->v_mountedhere != mp) {
939					/*
940					 * Note that we rely on the
941					 * VIRF_MOUNTPOINT loop condition to
942					 * ensure we stop iterating if dp is
943					 * no longer a mountpoint at all.
944					 */
945					continue;
946				}
947			} else
948				crosslkflags &= ~LK_NODDLKTREAT;
949		}
950		if (vfs_busy(mp, 0) != 0)
951			continue;
952		if (__predict_true(!crosslock))
953			vput(dp);
954		if (dp != ndp->ni_dvp)
955			vput(ndp->ni_dvp);
956		else
957			vrele(ndp->ni_dvp);
958		vrefact(vp_crossmp);
959		ndp->ni_dvp = vp_crossmp;
960		error = VFS_ROOT(mp, crosslkflags, &tdp);
961		vfs_unbusy(mp);
962		if (__predict_false(crosslock))
963			vput(dp);
964		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
965			panic("vp_crossmp exclusively locked or reclaimed");
966		if (error != 0)
967			break;
968		ndp->ni_vp = dp = tdp;
969	} while ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0);
970
971	return (error);
972}
973
974/*
975 * Search a pathname.
976 * This is a very central and rather complicated routine.
977 *
978 * The pathname is pointed to by cn_nameptr and is of length ni_pathlen.
979 * The starting directory is taken from ni_startdir. The pathname is
980 * descended until done, or a symbolic link is encountered. The cn_flags
981 * has ISLASTCN or'ed if the path is completed or ISSYMLINK or'ed if a
982 * symbolic link needing interpretation is encountered.
983 *
984 * The cn_nameiop is LOOKUP, CREATE, RENAME, or DELETE depending on
985 * whether the name is to be looked up, created, renamed, or deleted.
986 * When CREATE, RENAME, or DELETE is specified, information usable in
987 * creating, renaming, or deleting a directory entry may be calculated.
988 * If cn_flags has LOCKPARENT or'ed into it, the parent directory is returned
989 * locked. If it has WANTPARENT or'ed into it, the parent directory is
990 * returned unlocked. Otherwise the parent directory is not returned. If
991 * the target of the pathname exists and LOCKLEAF is or'ed into the cn_flags
992 * the target is returned locked, otherwise it is returned unlocked.
993 *
994 * Overall outline of lookup:
995 *
996 *	handle degenerate case where name is null string
997 *
998 * dirloop:
999 *	identify next component of name at ndp->ni_cnd.cn_nameptr
1000 *	handle .. special cases related to capabilities, chroot, jail
1001 *	if .. and crossing mount points and on mounted filesys, find parent
1002 *	call VOP_LOOKUP routine for next component name
1003 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
1004 *	    component vnode returned in ni_vp (if it exists), locked.
1005 *	if result vnode is mounted on and crossing mount points,
1006 *	    find mounted on vnode
1007 *	if more components of name, do next level at dirloop
1008 *	if VOP_LOOKUP returns ERELOOKUP, repeat the same level at dirloop
1009 *	return the answer in ni_vp, locked if LOCKLEAF set
1010 *	    if LOCKPARENT set, return locked parent in ni_dvp
1011 *	    if WANTPARENT set, return unlocked parent in ni_dvp
1012 */
int
vfs_lookup(struct nameidata *ndp)
{
	char *cp;			/* pointer into pathname argument */
	char *prev_ni_next;		/* saved ndp->ni_next */
	char *nulchar;			/* location of '\0' in cn_pnbuf */
	char *lastchar;			/* location of the last character */
	struct vnode *dp = NULL;	/* the directory we are searching */
	struct vnode *tdp;		/* saved dp */
	struct prison *pr;		/* prison chain cursor for ".." checks */
	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
	int docache;			/* == 0 do not cache last component */
	int wantparent;			/* != 0 => WANTPARENT or LOCKPARENT set */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;
	int relookup = 0;		/* do not consume the path component */
	struct componentname *cnp = &ndp->ni_cnd;
	int lkflags_save;		/* cn_lkflags saved around VOP_LOOKUP() */
	/*
	 * State of ndp->ni_dvp, consumed by the bad2 error exit:
	 *   0 - nothing has been done to ni_dvp yet
	 *   1 - ni_dvp has been unlocked but is still referenced
	 *   2 - ni_dvp has already been released (vput() or vrele())
	 */
	int ni_dvp_unlocked;

	/*
	 * Setup: break out flag bits into variables.
	 */
	ni_dvp_unlocked = 0;
	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
	/*
	 * When set to zero, docache causes the last component of the
	 * pathname to be deleted from the cache and the full lookup
	 * of the name to be done (via VOP_CACHEDLOOKUP()). Often
	 * filesystems need some pre-computed values that are made
	 * during the full lookup, for instance UFS sets dp->i_offset.
	 *
	 * The docache variable is set to zero when requested by the
	 * NOCACHE flag and for all modifying operations except CREATE.
	 *
	 * The XOR below yields a non-zero value exactly when NOCACHE is
	 * not set in cn_flags.
	 */
	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
	if (cnp->cn_nameiop == DELETE ||
	    (wantparent && cnp->cn_nameiop != CREATE &&
	     cnp->cn_nameiop != LOOKUP))
		docache = 0;
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	ndp->ni_dvp = NULL;

	/*
	 * Start with shared locks and take the starting directory out of
	 * the nameidata; ni_startdir is cleared once it has been copied.
	 */
	cnp->cn_lkflags = LK_SHARED;
	dp = ndp->ni_startdir;
	ndp->ni_startdir = NULLVP;

	/*
	 * Leading slashes, if any, are supposed to be skipped by the caller.
	 */
	MPASS(cnp->cn_nameptr[0] != '/');

	/*
	 * Check for degenerate name (e.g. / or "") which is a way of talking
	 * about a directory, e.g. like "/." or ".".
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		error = vfs_lookup_degenerate(ndp, dp, wantparent);
		if (error == 0)
			goto success_right_lock;
		goto bad_unlocked;
	}

	/*
	 * Nul-out trailing slashes (e.g., "foo///" -> "foo").
	 *
	 * This must be done before VOP_LOOKUP() because some fs's don't know
	 * about trailing slashes.  Remember if there were trailing slashes to
	 * handle symlinks, existing non-directories and non-existing files
	 * that won't be directories specially later.
	 */
	MPASS(ndp->ni_pathlen >= 2);
	lastchar = &cnp->cn_nameptr[ndp->ni_pathlen - 2];
	if (*lastchar == '/') {
		/*
		 * Each iteration overwrites one slash with a nul and shrinks
		 * ni_pathlen by one; the loop stops at the first character
		 * that is not a slash.
		 */
		while (lastchar >= cnp->cn_pnbuf) {
			*lastchar = '\0';
			lastchar--;
			ndp->ni_pathlen--;
			if (*lastchar != '/') {
				break;
			}
		}
		cnp->cn_flags |= TRAILINGSLASH;
	}

	/*
	 * We use shared locks until we hit the parent of the last cn then
	 * we adjust based on the requesting flags.
	 */
	vn_lock(dp,
	    enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));

dirloop:
	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr. It has to be freed with a call to NDFREE*.
	 *
	 * Store / as a temporary sentinel so that we only have one character
	 * to test for. Pathnames tend to be short so this should not
	 * result in cache misses.
	 */
	nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	KASSERT(*nulchar == '\0',
	    ("%s: expected nul at %p; string [%s]\n", __func__, nulchar,
	    cnp->cn_pnbuf));
	*nulchar = '/';
	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
		KASSERT(*cp != '\0',
		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
		    cnp->cn_nameptr));
		continue;
	}
	/* Restore the terminator; cp now points just past the component. */
	*nulchar = '\0';
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		error = ENAMETOOLONG;
		goto bad;
	}
	/*
	 * Account for the component in ni_pathlen and ni_next, keeping the
	 * previous values so that an ERELOOKUP retry can rewind them.
	 */
	prev_ni_pathlen = ndp->ni_pathlen;
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	prev_ni_next = ndp->ni_next;
	ndp->ni_next = cp;

	/*
	 * Something else should be clearing this.
	 */
	cnp->cn_flags &= ~(ISDOTDOT|ISLASTCN);

	/*
	 * Request a name cache entry by default; withdraw the request for
	 * the final component when docache is zero.
	 */
	cnp->cn_flags |= MAKEENTRY;
	if (*cp == '\0' && docache == 0)
		cnp->cn_flags &= ~MAKEENTRY;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	if (*ndp->ni_next == 0) {
		cnp->cn_flags |= ISLASTCN;

		/* DELETE and RENAME of a final "." are rejected. */
		if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))) {
			error = EINVAL;
			goto bad;
		}
	}

	nameicap_tracker_add(ndp, dp);

	/*
	 * Make sure degenerate names don't get here, their handling was
	 * previously found in this spot.
	 */
	MPASS(cnp->cn_nameptr[0] != '\0');

	/*
	 * Handle "..": six special cases (numbered from 0).
	 * 0. If doing a capability lookup and lookup_cap_dotdot is
	 *    disabled, return ENOTCAPABLE.
	 * 1. Return an error if this is the last component of
	 *    the name and the operation is DELETE or RENAME.
	 * 2. If at root directory (e.g. after chroot)
	 *    or at absolute root directory
	 *    then ignore it so can't get out.
	 * 3. If this vnode is the root of a mounted
	 *    filesystem, then replace it with the
	 *    vnode which was mounted on so we take the
	 *    .. in the other filesystem.
	 * 4. If the vnode is the top directory of
	 *    the jail or chroot, don't let them out.
	 * 5. If doing a capability lookup and lookup_cap_dotdot is
	 *    enabled, return ENOTCAPABLE if the lookup would escape
	 *    from the initial file descriptor directory.  Checks are
	 *    done by ensuring that namei() already traversed the
	 *    result of dotdot lookup.
	 */
	if (cnp->cn_flags & ISDOTDOT) {
		if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR |
		    NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR))
			NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf);
		if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL |
		    NI_LCF_CAP_DOTDOT)) == NI_LCF_STRICTREL)) {
			error = ENOTCAPABLE;
			goto bad;
		}
		if ((cnp->cn_flags & ISLASTCN) != 0 &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
			error = EINVAL;
			goto bad;
		}
		/*
		 * Each iteration either finishes the ".." handling or steps
		 * from the root of a mounted filesystem to the vnode it
		 * covers and tries again.
		 */
		for (;;) {
			/*
			 * Walk the credential's prison and its ancestors;
			 * pr is left non-NULL iff dp is the root of one of
			 * them.
			 */
			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
			     pr = pr->pr_parent)
				if (dp == pr->pr_root)
					break;
			bool isroot = dp == ndp->ni_rootdir ||
			    dp == ndp->ni_topdir || dp == rootvnode ||
			    pr != NULL;
			if (__predict_false(isroot && (ndp->ni_lcf &
			    (NI_LCF_STRICTREL | NI_LCF_STRICTREL_KTR)) != 0)) {
				if ((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0)
					NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf);
				if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0) {
					error = ENOTCAPABLE;
					goto capdotdot;
				}
			}
			/*
			 * ".." resolves to dp itself: the extra reference
			 * stands for ni_vp while ni_dvp aliases the same
			 * vnode.
			 */
			if (isroot || ((dp->v_vflag & VV_ROOT) != 0 &&
			    (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
				ndp->ni_dvp = dp;
				ndp->ni_vp = dp;
				VREF(dp);
				goto nextname;
			}
			/* Not a filesystem root: let VOP_LOOKUP() handle it. */
			if ((dp->v_vflag & VV_ROOT) == 0)
				break;
			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
				error = ENOENT;
				goto bad;
			}
			/*
			 * Switch to the covered vnode: reference it before
			 * the old directory is released, then lock it.
			 */
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			VREF(dp);
			vput(tdp);
			vn_lock(dp,
			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
			error = nameicap_check_dotdot(ndp, dp);
			if (error != 0) {
				/*
				 * Also the jump target for the ENOTCAPABLE
				 * case earlier in this loop.
				 */
capdotdot:
				goto bad;
			}
		}
	}

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
unionlookup:
#ifdef MAC
	error = mac_vnode_check_lookup(cnp->cn_cred, dp, cnp);
	if (__predict_false(error))
		goto bad;
#endif
	ndp->ni_dvp = dp;
	ndp->ni_vp = NULL;
	ASSERT_VOP_LOCKED(dp, "lookup");
	/*
	 * If we have a shared lock we may need to upgrade the lock for the
	 * last operation.
	 */
	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
		vn_lock(dp, LK_UPGRADE|LK_RETRY);
	if (VN_IS_DOOMED(dp)) {
		error = ENOENT;
		goto bad;
	}
	/*
	 * If we're looking up the last component and we need an exclusive
	 * lock, adjust our lkflags.
	 */
	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
		cnp->cn_lkflags = LK_EXCLUSIVE;
	/*
	 * The value adjusted by enforce_lkflags() is only in effect for the
	 * duration of the VOP_LOOKUP() call; cn_lkflags is restored after.
	 */
	lkflags_save = cnp->cn_lkflags;
	cnp->cn_lkflags = enforce_lkflags(dp->v_mount, cnp->cn_lkflags);
	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
	cnp->cn_lkflags = lkflags_save;
	if (error != 0) {
		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
		/*
		 * ENOENT in the root of a union mount: retry the same
		 * component in the covered vnode.
		 */
		if ((error == ENOENT) &&
		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
		    (dp->v_mount->mnt_flag & MNT_UNION)) {
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			VREF(dp);
			vput(tdp);
			vn_lock(dp,
			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
			nameicap_tracker_add(ndp, dp);
			goto unionlookup;
		}

		/*
		 * ERELOOKUP: present dp as its own result, holding an extra
		 * reference, and flag the retry so that the code at nextname
		 * rewinds the path state and restarts at dirloop.
		 */
		if (error == ERELOOKUP) {
			vref(dp);
			ndp->ni_vp = dp;
			error = 0;
			relookup = 1;
			goto good;
		}

		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * At this point, we know we're at the end of the
		 * pathname.  If creating / renaming, we can consider
		 * allowing the file or directory to be created / renamed,
		 * provided we're not on a read-only filesystem.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* trailing slash only allowed for directories */
		if ((cnp->cn_flags & TRAILINGSLASH) &&
		    !(cnp->cn_flags & WILLBEDIR)) {
			error = ENOENT;
			goto bad;
		}
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp);
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		goto success;
	}

good:
	/* From here on dp is the vnode that was found, not its parent. */
	dp = ndp->ni_vp;

	/*
	 * Check for symbolic link
	 */
	if ((dp->v_type == VLNK) &&
	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
	     *ndp->ni_next == '/')) {
		cnp->cn_flags |= ISSYMLINK;
		if (VN_IS_DOOMED(dp)) {
			/*
			 * We can't know whether the directory was mounted with
			 * NOSYMFOLLOW, so we can't follow safely.
			 */
			error = ENOENT;
			goto bad2;
		}
		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
			error = EACCES;
			goto bad2;
		}
		/*
		 * Symlink code always expects an unlocked dvp.
		 */
		if (ndp->ni_dvp != ndp->ni_vp) {
			VOP_UNLOCK(ndp->ni_dvp);
			ni_dvp_unlocked = 1;
		}
		goto success;
	}

	if ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 &&
	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
		error = vfs_lookup_cross_mount(ndp);
		if (error != 0)
			goto bad_unlocked;
		/*
		 * FALLTHROUGH to nextname
		 */
		dp = ndp->ni_vp;
	}

nextname:
	/*
	 * Not a symbolic link that we will follow.  Continue with the
	 * next component if there is any; otherwise, we're done.
	 */
	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
	    ("lookup: invalid path state."));
	if (relookup) {
		/*
		 * cn_nameptr has not been advanced, so rewinding ni_pathlen
		 * and ni_next makes dirloop parse the same component again.
		 */
		relookup = 0;
		ndp->ni_pathlen = prev_ni_pathlen;
		ndp->ni_next = prev_ni_next;
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		goto dirloop;
	}
	if (cnp->cn_flags & ISDOTDOT) {
		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
		if (error != 0)
			goto bad2;
	}
	if (*ndp->ni_next == '/') {
		/*
		 * More components follow: skip the run of slashes, release
		 * the parent and descend into dp.
		 */
		cnp->cn_nameptr = ndp->ni_next;
		while (*cnp->cn_nameptr == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		goto dirloop;
	}
	/*
	 * If we're processing a path with a trailing slash,
	 * check that the end result is a directory.
	 */
	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad2;
	}
	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		error = EROFS;
		goto bad2;
	}
	/*
	 * Bring the parent into the state the caller asked for: released
	 * when no parent is wanted, merely unlocked for WANTPARENT without
	 * LOCKPARENT, and left locked otherwise.
	 */
	if (!wantparent) {
		ni_dvp_unlocked = 2;
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
		VOP_UNLOCK(ndp->ni_dvp);
		ni_dvp_unlocked = 1;
	}

	if (cnp->cn_flags & AUDITVNODE1)
		AUDIT_ARG_VNODE1(dp);
	else if (cnp->cn_flags & AUDITVNODE2)
		AUDIT_ARG_VNODE2(dp);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp);
success:
	/*
	 * dp is the result vnode here, except when arriving from the
	 * EJUSTRETURN case, where ni_vp is NULL and dp is still the parent
	 * directory.
	 */
	/*
	 * FIXME: for lookups which only cross a mount point to fetch the
	 * root vnode, ni_dvp will be set to vp_crossmp. This can be a problem
	 * if either WANTPARENT or LOCKPARENT is set.
	 */
	/*
	 * Because of shared lookup we may have the vnode shared locked, but
	 * the caller may want it to be exclusively locked.
	 */
	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
		vn_lock(dp, LK_UPGRADE | LK_RETRY);
		if (VN_IS_DOOMED(dp)) {
			error = ENOENT;
			goto bad2;
		}
	}
success_right_lock:
	if (ndp->ni_vp != NULL) {
		if ((cnp->cn_flags & ISDOTDOT) == 0)
			nameicap_tracker_add(ndp, ndp->ni_vp);
		/* FAILIFEXISTS only applies when the result is not a symlink. */
		if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
			return (vfs_lookup_failifexists(ndp));
	}
	return (0);

	/*
	 * Error exits, entered at the label matching what is still held:
	 *   bad2         - release ni_dvp as recorded in ni_dvp_unlocked,
	 *                  then continue as for bad
	 *   bad          - vput() dp, then continue as for bad_unlocked
	 *   bad_unlocked - clear ni_vp and return the error
	 */
bad2:
	if (ni_dvp_unlocked != 2) {
		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
	}
bad:
	vput(dp);
bad_unlocked:
	ndp->ni_vp = NULL;
	return (error);
}
1488
1489/*
1490 * relookup - lookup a path name component
1491 *    Used by lookup to re-acquire things.
1492 */
1493int
1494vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1495    bool refstart)
1496{
1497	struct vnode *dp = NULL;		/* the directory we are searching */
1498	int rdonly;			/* lookup read-only flag bit */
1499	int error = 0;
1500
1501	KASSERT(cnp->cn_flags & ISLASTCN,
1502	    ("relookup: Not given last component."));
1503	/*
1504	 * Setup: break out flag bits into variables.
1505	 */
1506	KASSERT((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) != 0,
1507	    ("relookup: parent not wanted"));
1508	rdonly = cnp->cn_flags & RDONLY;
1509	cnp->cn_flags &= ~ISSYMLINK;
1510	dp = dvp;
1511	cnp->cn_lkflags = LK_EXCLUSIVE;
1512	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
1513
1514	/*
1515	 * Search a new directory.
1516	 *
1517	 * See a comment in vfs_lookup for cnp->cn_nameptr.
1518	 *
1519	 * Check for "" which represents the root directory after slash
1520	 * removal.
1521	 */
1522	if (cnp->cn_nameptr[0] == '\0') {
1523		/*
1524		 * Support only LOOKUP for "/" because lookup()
1525		 * can't succeed for CREATE, DELETE and RENAME.
1526		 */
1527		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
1528		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
1529
1530		if (!(cnp->cn_flags & LOCKLEAF))
1531			VOP_UNLOCK(dp);
1532		*vpp = dp;
1533		/* XXX This should probably move to the top of function. */
1534		if (refstart)
1535			panic("lookup: SAVESTART");
1536		return (0);
1537	}
1538
1539	if (cnp->cn_flags & ISDOTDOT)
1540		panic ("relookup: lookup on dot-dot");
1541
1542	/*
1543	 * We now have a segment name to search for, and a directory to search.
1544	 */
1545	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
1546		KASSERT(*vpp == NULL, ("leaf should be empty"));
1547		if (error != EJUSTRETURN)
1548			goto bad;
1549		/*
1550		 * If creating and at end of pathname, then can consider
1551		 * allowing file to be created.
1552		 */
1553		if (rdonly) {
1554			error = EROFS;
1555			goto bad;
1556		}
1557		/* ASSERT(dvp == ndp->ni_startdir) */
1558		if (refstart)
1559			VREF(dvp);
1560		if ((cnp->cn_flags & LOCKPARENT) == 0)
1561			VOP_UNLOCK(dp);
1562		/*
1563		 * We return with ni_vp NULL to indicate that the entry
1564		 * doesn't currently exist, leaving a pointer to the
1565		 * (possibly locked) directory vnode in ndp->ni_dvp.
1566		 */
1567		return (0);
1568	}
1569
1570	dp = *vpp;
1571
1572	/*
1573	 * Disallow directory write attempts on read-only filesystems.
1574	 */
1575	if (rdonly &&
1576	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1577		if (dvp == dp)
1578			vrele(dvp);
1579		else
1580			vput(dvp);
1581		error = EROFS;
1582		goto bad;
1583	}
1584	/*
1585	 * Set the parent lock/ref state to the requested state.
1586	 */
1587	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp)
1588		VOP_UNLOCK(dvp);
1589	/*
1590	 * Check for symbolic link
1591	 */
1592	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
1593	    ("relookup: symlink found.\n"));
1594
1595	/* ASSERT(dvp == ndp->ni_startdir) */
1596	if (refstart)
1597		VREF(dvp);
1598
1599	if ((cnp->cn_flags & LOCKLEAF) == 0)
1600		VOP_UNLOCK(dp);
1601	return (0);
1602bad:
1603	vput(dp);
1604	*vpp = NULL;
1605	return (error);
1606}
1607
1608#ifdef INVARIANTS
1609/*
1610 * Validate the final state of ndp after the lookup.
1611 */
1612static void
1613NDVALIDATE_impl(struct nameidata *ndp, int line)
1614{
1615	struct componentname *cnp;
1616
1617	cnp = &ndp->ni_cnd;
1618	if (cnp->cn_pnbuf == NULL)
1619		panic("%s: got no buf! called from %d", __func__, line);
1620}
1621
1622#endif
1623