1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD$");
41
42#include "opt_capsicum.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/dirent.h>
48#include <sys/kernel.h>
49#include <sys/capsicum.h>
50#include <sys/fcntl.h>
51#include <sys/jail.h>
52#include <sys/lock.h>
53#include <sys/mutex.h>
54#include <sys/namei.h>
55#include <sys/vnode.h>
56#include <sys/mount.h>
57#include <sys/filedesc.h>
58#include <sys/proc.h>
59#include <sys/sdt.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65#ifdef INVARIANTS
66#include <machine/_inttypes.h>
67#endif
68
69#include <security/audit/audit.h>
70#include <security/mac/mac_framework.h>
71
72#include <vm/uma.h>
73
/*
 * Compile-time switch for the debug printf in lookup().  It is defined and
 * then immediately undefined, so the diagnostic code is compiled out by
 * default; remove the #undef to enable it.
 */
#define	NAMEI_DIAGNOSTIC 1
#undef NAMEI_DIAGNOSTIC
76
/*
 * SDT provider "vfs" and the two namei lookup probes fired from this file.
 *
 * lookup:entry is fired with (starting vnode, path buffer, cn_flags, false).
 * lookup:return is fired with (error, resulting vnode or NULL, false, ndp).
 *
 * NOTE(review): the last lookup:return argument is declared as
 * "struct nameidata" but the callers in this file pass a pointer (ndp);
 * confirm whether the declared type should be "struct nameidata *".
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
    "unsigned long", "bool");
SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool",
    "struct nameidata");
82
/*
 * Allocation zone for namei pathname buffers.  Created in nameiinit() with
 * an item size of MAXPATHLEN.
 */
uma_zone_t namei_zone;

/*
 * Placeholder vnode for mp traversal.  Allocated once in nameiinit() and
 * backed by crossmp_vnodeops.
 */
static struct vnode *vp_crossmp;
88
89static int
90crossmp_vop_islocked(struct vop_islocked_args *ap)
91{
92
93	return (LK_SHARED);
94}
95
96static int
97crossmp_vop_lock1(struct vop_lock1_args *ap)
98{
99	struct vnode *vp;
100	struct lock *lk __unused;
101	const char *file __unused;
102	int flags, line __unused;
103
104	vp = ap->a_vp;
105	lk = vp->v_vnlock;
106	flags = ap->a_flags;
107	file = ap->a_file;
108	line = ap->a_line;
109
110	if ((flags & LK_SHARED) == 0)
111		panic("invalid lock request for crossmp");
112
113	WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line,
114	    flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL);
115	WITNESS_LOCK(&lk->lock_object, 0, file, line);
116	if ((flags & LK_INTERLOCK) != 0)
117		VI_UNLOCK(vp);
118	LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line);
119	return (0);
120}
121
122static int
123crossmp_vop_unlock(struct vop_unlock_args *ap)
124{
125	struct vnode *vp;
126	struct lock *lk __unused;
127
128	vp = ap->a_vp;
129	lk = vp->v_vnlock;
130
131	WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE);
132	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
133	    LOCK_LINE);
134	return (0);
135}
136
/*
 * Vnode operations vector for the crossmp placeholder vnode.  Only the
 * lock-related entry points are overridden; every other operation is
 * delegated to default_vnodeops.
 */
static struct vop_vector crossmp_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_islocked =		crossmp_vop_islocked,
	.vop_lock1 =		crossmp_vop_lock1,
	.vop_unlock =		crossmp_vop_unlock,
};
/*
 * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
 * gets allocated early. See nameiinit for the direct call below.
 */
147
/*
 * One entry in the per-lookup list (ndp->ni_cap_tracker) of directories
 * already traversed.  The list is consulted by nameicap_check_dotdot() to
 * validate ".." lookups.
 */
struct nameicap_tracker {
	/* Traversed directory; held with vhold() while on the list. */
	struct vnode *dp;
	/* Linkage on ndp->ni_cap_tracker. */
	TAILQ_ENTRY(nameicap_tracker) nm_link;
};
152
/*
 * Malloc type for the struct nameicap_tracker elements used for dotdot
 * capability checks.
 */
MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot");
155
/*
 * One-time initialization for namei, run through SYSINIT at SI_SUB_VFS,
 * SI_ORDER_SECOND: create the pathname buffer zone, register the crossmp
 * vnode operations vector and allocate the crossmp placeholder vnode.
 */
static void
nameiinit(void *dummy __unused)
{

	/* Each pathname buffer is MAXPATHLEN bytes. */
	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	/* Registered by hand instead of through VFS_VOP_VECTOR_REGISTER. */
	vfs_vector_op_register(&crossmp_vnodeops);
	/*
	 * NOTE(review): the getnewvnode() return value is ignored; confirm
	 * that it cannot fail here.
	 */
	getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
166
/*
 * vfs.lookup_cap_dotdot: when non-zero (the default), namei_setup() sets
 * NI_LCF_CAP_DOTDOT on strictly relative lookups so that ".." components
 * are validated against the traversed-directory list instead of being
 * rejected.
 */
static int lookup_cap_dotdot = 1;
SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
    &lookup_cap_dotdot, 0,
    "enables \"..\" components in path lookup in capability mode");
/*
 * vfs.lookup_cap_dotdot_nonlocal: when zero, nameicap_check_dotdot()
 * fails ".." lookups on mounts without MNT_LOCAL.  Defaults to enabled.
 */
static int lookup_cap_dotdot_nonlocal = 1;
SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
    &lookup_cap_dotdot_nonlocal, 0,
    "enables \"..\" components in path lookup in capability mode "
    "on non-local mount");
176
177static void
178nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
179{
180	struct nameicap_tracker *nt;
181	struct componentname *cnp;
182
183	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
184		return;
185	cnp = &ndp->ni_cnd;
186	nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
187	if (nt != NULL && nt->dp == dp)
188		return;
189	nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
190	vhold(dp);
191	nt->dp = dp;
192	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
193}
194
195static void
196nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
197{
198	struct nameicap_tracker *nt, *nt1;
199
200	nt = first;
201	TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
202		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
203		vdrop(nt->dp);
204		free(nt, M_NAMEITRACKER);
205	}
206}
207
208static void
209nameicap_cleanup(struct nameidata *ndp)
210{
211	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
212	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
213	nameicap_cleanup_from(ndp, NULL);
214}
215
216/*
217 * For dotdot lookups in capability mode, only allow the component
218 * lookup to succeed if the resulting directory was already traversed
219 * during the operation.  This catches situations where already
220 * traversed directory is moved to different parent, and then we walk
221 * over it with dotdots.
222 *
223 * Also allow to force failure of dotdot lookups for non-local
224 * filesystems, where external agents might assist local lookups to
225 * escape the compartment.
226 */
227static int
228nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
229{
230	struct nameicap_tracker *nt;
231	struct mount *mp;
232
233	if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf &
234	    NI_LCF_STRICTRELATIVE) == 0)
235		return (0);
236	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
237		return (ENOTCAPABLE);
238	mp = dp->v_mount;
239	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
240	    (mp->mnt_flag & MNT_LOCAL) == 0)
241		return (ENOTCAPABLE);
242	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
243	    nm_link) {
244		if (dp == nt->dp) {
245			nt = TAILQ_NEXT(nt, nm_link);
246			if (nt != NULL)
247				nameicap_cleanup_from(ndp, nt);
248			return (0);
249		}
250	}
251	return (ENOTCAPABLE);
252}
253
254static void
255namei_cleanup_cnp(struct componentname *cnp)
256{
257
258	uma_zfree(namei_zone, cnp->cn_pnbuf);
259#ifdef DIAGNOSTIC
260	cnp->cn_pnbuf = NULL;
261	cnp->cn_nameptr = NULL;
262#endif
263}
264
265static int
266namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
267{
268	struct componentname *cnp;
269
270	cnp = &ndp->ni_cnd;
271	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
272#ifdef KTRACE
273		if (KTRPOINT(curthread, KTR_CAPFAIL))
274			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
275#endif
276		return (ENOTCAPABLE);
277	}
278	while (*(cnp->cn_nameptr) == '/') {
279		cnp->cn_nameptr++;
280		ndp->ni_pathlen--;
281	}
282	*dpp = ndp->ni_rootdir;
283	vrefact(*dpp);
284	return (0);
285}
286
287static int
288namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
289{
290	struct componentname *cnp;
291	struct file *dfp;
292	struct thread *td;
293	struct pwd *pwd;
294	cap_rights_t rights;
295	int error;
296	bool startdir_used;
297
298	cnp = &ndp->ni_cnd;
299	td = cnp->cn_thread;
300
301	startdir_used = false;
302	*pwdp = NULL;
303	*dpp = NULL;
304
305#ifdef CAPABILITY_MODE
306	/*
307	 * In capability mode, lookups must be restricted to happen in
308	 * the subtree with the root specified by the file descriptor:
309	 * - The root must be real file descriptor, not the pseudo-descriptor
310	 *   AT_FDCWD.
311	 * - The passed path must be relative and not absolute.
312	 * - If lookup_cap_dotdot is disabled, path must not contain the
313	 *   '..' components.
314	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
315	 *   components lookups result in the directories which were
316	 *   previously walked by us, which prevents an escape from
317	 *   the relative root.
318	 */
319	if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
320		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
321		ndp->ni_resflags |= NIRES_STRICTREL;
322		if (ndp->ni_dirfd == AT_FDCWD) {
323#ifdef KTRACE
324			if (KTRPOINT(td, KTR_CAPFAIL))
325				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
326#endif
327			return (ECAPMODE);
328		}
329	}
330#endif
331	error = 0;
332
333	/*
334	 * Get starting point for the translation.
335	 */
336	pwd = pwd_hold(td);
337	/*
338	 * The reference on ni_rootdir is acquired in the block below to avoid
339	 * back-to-back atomics for absolute lookups.
340	 */
341	ndp->ni_rootdir = pwd->pwd_rdir;
342	ndp->ni_topdir = pwd->pwd_jdir;
343
344	if (cnp->cn_pnbuf[0] == '/') {
345		ndp->ni_resflags |= NIRES_ABS;
346		error = namei_handle_root(ndp, dpp);
347	} else {
348		if (ndp->ni_startdir != NULL) {
349			*dpp = ndp->ni_startdir;
350			startdir_used = true;
351		} else if (ndp->ni_dirfd == AT_FDCWD) {
352			*dpp = pwd->pwd_cdir;
353			vrefact(*dpp);
354		} else {
355			rights = *ndp->ni_rightsneeded;
356			cap_rights_set_one(&rights, CAP_LOOKUP);
357
358			if (cnp->cn_flags & AUDITVNODE1)
359				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
360			if (cnp->cn_flags & AUDITVNODE2)
361				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
362			/*
363			 * Effectively inlined fgetvp_rights, because
364			 * we need to inspect the file as well as
365			 * grabbing the vnode.  No check for O_PATH,
366			 * files to implement its semantic.
367			 */
368			error = fget_cap(td, ndp->ni_dirfd, &rights,
369			    &dfp, &ndp->ni_filecaps);
370			if (error != 0) {
371				/*
372				 * Preserve the error; it should either be EBADF
373				 * or capability-related, both of which can be
374				 * safely returned to the caller.
375				 */
376			} else {
377				if (dfp->f_ops == &badfileops) {
378					error = EBADF;
379				} else if (dfp->f_vnode == NULL) {
380					error = ENOTDIR;
381				} else {
382					*dpp = dfp->f_vnode;
383					vref(*dpp);
384
385					if ((dfp->f_flag & FSEARCH) != 0)
386						cnp->cn_flags |= NOEXECCHECK;
387				}
388				fdrop(dfp, td);
389			}
390#ifdef CAPABILITIES
391			/*
392			 * If file descriptor doesn't have all rights,
393			 * all lookups relative to it must also be
394			 * strictly relative.
395			 */
396			CAP_ALL(&rights);
397			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
398			    &rights) ||
399			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
400			    ndp->ni_filecaps.fc_nioctls != -1) {
401				ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
402				ndp->ni_resflags |= NIRES_STRICTREL;
403			}
404#endif
405		}
406		if (error == 0 && (*dpp)->v_type != VDIR &&
407		    (cnp->cn_pnbuf[0] != '\0' ||
408		    (cnp->cn_flags & EMPTYPATH) == 0))
409			error = ENOTDIR;
410	}
411	if (error == 0 && (cnp->cn_flags & RBENEATH) != 0) {
412		if (cnp->cn_pnbuf[0] == '/') {
413			error = ENOTCAPABLE;
414		} else if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0) {
415			ndp->ni_lcf |= NI_LCF_STRICTRELATIVE |
416			    NI_LCF_CAP_DOTDOT;
417		}
418	}
419
420	/*
421	 * If we are auditing the kernel pathname, save the user pathname.
422	 */
423	if (cnp->cn_flags & AUDITVNODE1)
424		AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
425	if (cnp->cn_flags & AUDITVNODE2)
426		AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
427	if (ndp->ni_startdir != NULL && !startdir_used)
428		vrele(ndp->ni_startdir);
429	if (error != 0) {
430		if (*dpp != NULL)
431			vrele(*dpp);
432		pwd_drop(pwd);
433		return (error);
434	}
435	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
436	    lookup_cap_dotdot != 0)
437		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
438	SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
439	    cnp->cn_flags, false);
440	*pwdp = pwd;
441	return (0);
442}
443
444static int
445namei_getpath(struct nameidata *ndp)
446{
447	struct componentname *cnp;
448	int error;
449
450	cnp = &ndp->ni_cnd;
451
452	/*
453	 * Get a buffer for the name to be translated, and copy the
454	 * name into the buffer.
455	 */
456	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
457	if (ndp->ni_segflg == UIO_SYSSPACE) {
458		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
459		    &ndp->ni_pathlen);
460	} else {
461		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
462		    &ndp->ni_pathlen);
463	}
464
465	if (__predict_false(error != 0))
466		return (error);
467
468	/*
469	 * Don't allow empty pathnames unless EMPTYPATH is specified.
470	 * Caller checks for ENOENT as an indication for the empty path.
471	 */
472	if (__predict_false(*cnp->cn_pnbuf == '\0'))
473		return (ENOENT);
474
475	cnp->cn_nameptr = cnp->cn_pnbuf;
476	return (0);
477}
478
479static int
480namei_emptypath(struct nameidata *ndp)
481{
482	struct componentname *cnp;
483	struct pwd *pwd;
484	struct vnode *dp;
485	int error;
486
487	cnp = &ndp->ni_cnd;
488	MPASS(*cnp->cn_pnbuf == '\0');
489	MPASS((cnp->cn_flags & EMPTYPATH) != 0);
490	MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
491
492	error = namei_setup(ndp, &dp, &pwd);
493	if (error != 0) {
494		namei_cleanup_cnp(cnp);
495		goto errout;
496	}
497
498	/*
499	 * Usecount on dp already provided by namei_setup.
500	 */
501	ndp->ni_vp = dp;
502	namei_cleanup_cnp(cnp);
503	pwd_drop(pwd);
504	ndp->ni_resflags |= NIRES_EMPTYPATH;
505	NDVALIDATE(ndp);
506	if ((cnp->cn_flags & LOCKLEAF) != 0) {
507		VOP_LOCK(dp, (cnp->cn_flags & LOCKSHARED) != 0 ?
508		    LK_SHARED : LK_EXCLUSIVE);
509		if (VN_IS_DOOMED(dp)) {
510			vput(dp);
511			error = ENOENT;
512			goto errout;
513		}
514	}
515	SDT_PROBE4(vfs, namei, lookup, return, 0, ndp->ni_vp, false, ndp);
516	return (0);
517
518errout:
519	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
520	return (error);
521}
522
523/*
524 * Convert a pathname into a pointer to a locked vnode.
525 *
526 * The FOLLOW flag is set when symbolic links are to be followed
527 * when they occur at the end of the name translation process.
528 * Symbolic links are always followed for all other pathname
529 * components other than the last.
530 *
531 * The segflg defines whether the name is to be copied from user
532 * space or kernel space.
533 *
534 * Overall outline of namei:
535 *
536 *	copy in name
537 *	get starting directory
538 *	while (!done && !error) {
539 *		call lookup to search path.
540 *		if symbolic link, massage name in buffer and continue
541 *	}
542 */
543int
544namei(struct nameidata *ndp)
545{
546	char *cp;		/* pointer into pathname argument */
547	struct vnode *dp;	/* the directory we are searching */
548	struct iovec aiov;		/* uio for reading symbolic links */
549	struct componentname *cnp;
550	struct thread *td;
551	struct pwd *pwd;
552	struct uio auio;
553	int error, linklen;
554	enum cache_fpl_status status;
555
556	cnp = &ndp->ni_cnd;
557	td = cnp->cn_thread;
558#ifdef INVARIANTS
559	KASSERT((ndp->ni_debugflags & NAMEI_DBG_CALLED) == 0,
560	    ("%s: repeated call to namei without NDREINIT", __func__));
561	KASSERT(ndp->ni_debugflags == NAMEI_DBG_INITED,
562	    ("%s: bad debugflags %d", __func__, ndp->ni_debugflags));
563	ndp->ni_debugflags |= NAMEI_DBG_CALLED;
564	if (ndp->ni_startdir != NULL)
565		ndp->ni_debugflags |= NAMEI_DBG_HADSTARTDIR;
566	if (cnp->cn_flags & FAILIFEXISTS) {
567		KASSERT(cnp->cn_nameiop == CREATE,
568		    ("%s: FAILIFEXISTS passed for op %d", __func__, cnp->cn_nameiop));
569		/*
570		 * The limitation below is to restrict hairy corner cases.
571		 */
572		KASSERT((cnp->cn_flags & (LOCKPARENT | LOCKLEAF)) == LOCKPARENT,
573		    ("%s: FAILIFEXISTS must be passed with LOCKPARENT and without LOCKLEAF",
574		    __func__));
575	}
576	/*
577	 * For NDVALIDATE.
578	 *
579	 * While NDINIT may seem like a more natural place to do it, there are
580	 * callers which directly modify flags past invoking init.
581	 */
582	cnp->cn_origflags = cnp->cn_flags;
583#endif
584	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
585	KASSERT(ndp->ni_resflags == 0, ("%s: garbage in ni_resflags: %x\n",
586	    __func__, ndp->ni_resflags));
587	KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc"));
588	KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0,
589	    ("namei: unexpected flags: %" PRIx64 "\n",
590	    cnp->cn_flags & NAMEI_INTERNAL_FLAGS));
591	if (cnp->cn_flags & NOCACHE)
592		KASSERT(cnp->cn_nameiop != LOOKUP,
593		    ("%s: NOCACHE passed with LOOKUP", __func__));
594	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
595	    ndp->ni_startdir->v_type == VBAD);
596
597	ndp->ni_lcf = 0;
598	ndp->ni_loopcnt = 0;
599	ndp->ni_vp = NULL;
600
601	error = namei_getpath(ndp);
602	if (__predict_false(error != 0)) {
603		if (error == ENOENT && (cnp->cn_flags & EMPTYPATH) != 0)
604			return (namei_emptypath(ndp));
605		namei_cleanup_cnp(cnp);
606		SDT_PROBE4(vfs, namei, lookup, return, error, NULL,
607		    false, ndp);
608		return (error);
609	}
610
611#ifdef KTRACE
612	if (KTRPOINT(td, KTR_NAMEI)) {
613		KASSERT(cnp->cn_thread == curthread,
614		    ("namei not using curthread"));
615		ktrnamei(cnp->cn_pnbuf);
616	}
617#endif
618
619	/*
620	 * First try looking up the target without locking any vnodes.
621	 *
622	 * We may need to start from scratch or pick up where it left off.
623	 */
624	error = cache_fplookup(ndp, &status, &pwd);
625	switch (status) {
626	case CACHE_FPL_STATUS_UNSET:
627		__assert_unreachable();
628		break;
629	case CACHE_FPL_STATUS_HANDLED:
630		if (error == 0)
631			NDVALIDATE(ndp);
632		return (error);
633	case CACHE_FPL_STATUS_PARTIAL:
634		TAILQ_INIT(&ndp->ni_cap_tracker);
635		dp = ndp->ni_startdir;
636		break;
637	case CACHE_FPL_STATUS_DESTROYED:
638		ndp->ni_loopcnt = 0;
639		error = namei_getpath(ndp);
640		if (__predict_false(error != 0)) {
641			namei_cleanup_cnp(cnp);
642			return (error);
643		}
644		/* FALLTHROUGH */
645	case CACHE_FPL_STATUS_ABORTED:
646		TAILQ_INIT(&ndp->ni_cap_tracker);
647		MPASS(ndp->ni_lcf == 0);
648		error = namei_setup(ndp, &dp, &pwd);
649		if (error != 0) {
650			namei_cleanup_cnp(cnp);
651			return (error);
652		}
653		break;
654	}
655
656	/*
657	 * Locked lookup.
658	 */
659	for (;;) {
660		ndp->ni_startdir = dp;
661		error = lookup(ndp);
662		if (error != 0)
663			goto out;
664
665		/*
666		 * If not a symbolic link, we're done.
667		 */
668		if ((cnp->cn_flags & ISSYMLINK) == 0) {
669			SDT_PROBE4(vfs, namei, lookup, return, error,
670			    (error == 0 ? ndp->ni_vp : NULL), false, ndp);
671			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
672				namei_cleanup_cnp(cnp);
673			} else
674				cnp->cn_flags |= HASBUF;
675			nameicap_cleanup(ndp);
676			pwd_drop(pwd);
677			if (error == 0)
678				NDVALIDATE(ndp);
679			return (error);
680		}
681		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
682			error = ELOOP;
683			break;
684		}
685#ifdef MAC
686		if ((cnp->cn_flags & NOMACCHECK) == 0) {
687			error = mac_vnode_check_readlink(td->td_ucred,
688			    ndp->ni_vp);
689			if (error != 0)
690				break;
691		}
692#endif
693		if (ndp->ni_pathlen > 1)
694			cp = uma_zalloc(namei_zone, M_WAITOK);
695		else
696			cp = cnp->cn_pnbuf;
697		aiov.iov_base = cp;
698		aiov.iov_len = MAXPATHLEN;
699		auio.uio_iov = &aiov;
700		auio.uio_iovcnt = 1;
701		auio.uio_offset = 0;
702		auio.uio_rw = UIO_READ;
703		auio.uio_segflg = UIO_SYSSPACE;
704		auio.uio_td = td;
705		auio.uio_resid = MAXPATHLEN;
706		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
707		if (error != 0) {
708			if (ndp->ni_pathlen > 1)
709				uma_zfree(namei_zone, cp);
710			break;
711		}
712		linklen = MAXPATHLEN - auio.uio_resid;
713		if (linklen == 0) {
714			if (ndp->ni_pathlen > 1)
715				uma_zfree(namei_zone, cp);
716			error = ENOENT;
717			break;
718		}
719		if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
720			if (ndp->ni_pathlen > 1)
721				uma_zfree(namei_zone, cp);
722			error = ENAMETOOLONG;
723			break;
724		}
725		if (ndp->ni_pathlen > 1) {
726			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
727			uma_zfree(namei_zone, cnp->cn_pnbuf);
728			cnp->cn_pnbuf = cp;
729		} else
730			cnp->cn_pnbuf[linklen] = '\0';
731		ndp->ni_pathlen += linklen;
732		vput(ndp->ni_vp);
733		dp = ndp->ni_dvp;
734		/*
735		 * Check if root directory should replace current directory.
736		 */
737		cnp->cn_nameptr = cnp->cn_pnbuf;
738		if (*(cnp->cn_nameptr) == '/') {
739			vrele(dp);
740			error = namei_handle_root(ndp, &dp);
741			if (error != 0)
742				goto out;
743		}
744	}
745	vput(ndp->ni_vp);
746	ndp->ni_vp = NULL;
747	vrele(ndp->ni_dvp);
748out:
749	MPASS(error != 0);
750	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
751	namei_cleanup_cnp(cnp);
752	nameicap_cleanup(ndp);
753	pwd_drop(pwd);
754	return (error);
755}
756
757static int
758compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
759{
760
761	if (mp == NULL || ((lkflags & LK_SHARED) &&
762	    (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
763	    ((cnflags & ISDOTDOT) &&
764	    (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
765		lkflags &= ~LK_SHARED;
766		lkflags |= LK_EXCLUSIVE;
767	}
768	lkflags |= LK_NODDLKTREAT;
769	return (lkflags);
770}
771
772static __inline int
773needs_exclusive_leaf(struct mount *mp, int flags)
774{
775
776	/*
777	 * Intermediate nodes can use shared locks, we only need to
778	 * force an exclusive lock for leaf nodes.
779	 */
780	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
781		return (0);
782
783	/* Always use exclusive locks if LOCKSHARED isn't set. */
784	if (!(flags & LOCKSHARED))
785		return (1);
786
787	/*
788	 * For lookups during open(), if the mount point supports
789	 * extended shared operations, then use a shared lock for the
790	 * leaf node, otherwise use an exclusive lock.
791	 */
792	if ((flags & ISOPEN) != 0)
793		return (!MNT_EXTENDED_SHARED(mp));
794
795	/*
796	 * Lookup requests outside of open() that specify LOCKSHARED
797	 * only need a shared lock on the leaf vnode.
798	 */
799	return (0);
800}
801
/*
 * Various filesystems expect to be able to copy a name component with length
 * bounded by NAME_MAX into a directory entry buffer of size MAXNAMLEN.  Make
 * sure that these are the same size.  lookup() rejects components longer
 * than NAME_MAX with ENAMETOOLONG, so this check ties that limit to the
 * directory entry buffer size at compile time.
 */
_Static_assert(MAXNAMLEN == NAME_MAX,
    "MAXNAMLEN and NAME_MAX have different values");
809
810/*
811 * Search a pathname.
812 * This is a very central and rather complicated routine.
813 *
 * The pathname is pointed to by cnp->cn_nameptr and is of length ni_pathlen.
 * The starting directory is taken from ni_startdir. The pathname is
 * descended until done, or a symbolic link is encountered. The ISSYMLINK
 * flag in cn_flags is clear if the path is completed; it is set if a
 * symbolic link needing interpretation is encountered.
819 *
820 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
821 * whether the name is to be looked up, created, renamed, or deleted.
822 * When CREATE, RENAME, or DELETE is specified, information usable in
823 * creating, renaming, or deleting a directory entry may be calculated.
824 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
825 * locked. If flag has WANTPARENT or'ed into it, the parent directory is
826 * returned unlocked. Otherwise the parent directory is not returned. If
827 * the target of the pathname exists and LOCKLEAF is or'ed into the flag
828 * the target is returned locked, otherwise it is returned unlocked.
829 * When creating or renaming and LOCKPARENT is specified, the target may not
830 * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
831 *
832 * Overall outline of lookup:
833 *
834 * dirloop:
 *	identify next component of name at cnp->cn_nameptr
836 *	handle degenerate case where name is null string
837 *	if .. and crossing mount points and on mounted filesys, find parent
838 *	call VOP_LOOKUP routine for next component name
839 *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
840 *	    component vnode returned in ni_vp (if it exists), locked.
841 *	if result vnode is mounted on and crossing mount points,
842 *	    find mounted on vnode
843 *	if more components of name, do next level at dirloop
844 *	return the answer in ni_vp, locked if LOCKLEAF set
845 *	    if LOCKPARENT set, return locked parent in ni_dvp
846 *	    if WANTPARENT set, return unlocked parent in ni_dvp
847 */
848int
849lookup(struct nameidata *ndp)
850{
851	char *cp;			/* pointer into pathname argument */
852	char *prev_ni_next;		/* saved ndp->ni_next */
853	char *nulchar;			/* location of '\0' in cn_pnbuf */
854	struct vnode *dp = NULL;	/* the directory we are searching */
855	struct vnode *tdp;		/* saved dp */
856	struct mount *mp;		/* mount table entry */
857	struct prison *pr;
858	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
859	int docache;			/* == 0 do not cache last component */
860	int wantparent;			/* 1 => wantparent or lockparent flag */
861	int rdonly;			/* lookup read-only flag bit */
862	int error = 0;
863	int dpunlocked = 0;		/* dp has already been unlocked */
864	int relookup = 0;		/* do not consume the path component */
865	struct componentname *cnp = &ndp->ni_cnd;
866	int lkflags_save;
867	int ni_dvp_unlocked;
868
869	/*
870	 * Setup: break out flag bits into variables.
871	 */
872	ni_dvp_unlocked = 0;
873	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
874	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
875	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
876	/*
877	 * When set to zero, docache causes the last component of the
878	 * pathname to be deleted from the cache and the full lookup
879	 * of the name to be done (via VOP_CACHEDLOOKUP()). Often
880	 * filesystems need some pre-computed values that are made
881	 * during the full lookup, for instance UFS sets dp->i_offset.
882	 *
883	 * The docache variable is set to zero when requested by the
884	 * NOCACHE flag and for all modifying operations except CREATE.
885	 */
886	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
887	if (cnp->cn_nameiop == DELETE ||
888	    (wantparent && cnp->cn_nameiop != CREATE &&
889	     cnp->cn_nameiop != LOOKUP))
890		docache = 0;
891	rdonly = cnp->cn_flags & RDONLY;
892	cnp->cn_flags &= ~ISSYMLINK;
893	ndp->ni_dvp = NULL;
894	/*
895	 * We use shared locks until we hit the parent of the last cn then
896	 * we adjust based on the requesting flags.
897	 */
898	cnp->cn_lkflags = LK_SHARED;
899	dp = ndp->ni_startdir;
900	ndp->ni_startdir = NULLVP;
901	vn_lock(dp,
902	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
903	    cnp->cn_flags));
904
905dirloop:
906	/*
907	 * Search a new directory.
908	 *
909	 * The last component of the filename is left accessible via
910	 * cnp->cn_nameptr for callers that need the name. Callers needing
911	 * the name set the SAVENAME flag. When done, they assume
912	 * responsibility for freeing the pathname buffer.
913	 *
914	 * Store / as a temporary sentinel so that we only have one character
915	 * to test for. Pathnames tend to be short so this should not be
916	 * resulting in cache misses.
917	 */
918	nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
919	KASSERT(*nulchar == '\0',
920	    ("%s: expected nul at %p; string [%s]\n", __func__, nulchar,
921	    cnp->cn_pnbuf));
922	*nulchar = '/';
923	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
924		KASSERT(*cp != '\0',
925		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
926		    cnp->cn_nameptr));
927		continue;
928	}
929	*nulchar = '\0';
930	cnp->cn_namelen = cp - cnp->cn_nameptr;
931	if (cnp->cn_namelen > NAME_MAX) {
932		error = ENAMETOOLONG;
933		goto bad;
934	}
935#ifdef NAMEI_DIAGNOSTIC
936	{ char c = *cp;
937	*cp = '\0';
938	printf("{%s}: ", cnp->cn_nameptr);
939	*cp = c; }
940#endif
941	prev_ni_pathlen = ndp->ni_pathlen;
942	ndp->ni_pathlen -= cnp->cn_namelen;
943	KASSERT(ndp->ni_pathlen <= PATH_MAX,
944	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
945	prev_ni_next = ndp->ni_next;
946	ndp->ni_next = cp;
947
948	/*
949	 * Replace multiple slashes by a single slash and trailing slashes
950	 * by a null.  This must be done before VOP_LOOKUP() because some
951	 * fs's don't know about trailing slashes.  Remember if there were
952	 * trailing slashes to handle symlinks, existing non-directories
953	 * and non-existing files that won't be directories specially later.
954	 */
955	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
956		cp++;
957		ndp->ni_pathlen--;
958		if (*cp == '\0') {
959			*ndp->ni_next = '\0';
960			cnp->cn_flags |= TRAILINGSLASH;
961		}
962	}
963	ndp->ni_next = cp;
964
965	cnp->cn_flags |= MAKEENTRY;
966	if (*cp == '\0' && docache == 0)
967		cnp->cn_flags &= ~MAKEENTRY;
968	if (cnp->cn_namelen == 2 &&
969	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
970		cnp->cn_flags |= ISDOTDOT;
971	else
972		cnp->cn_flags &= ~ISDOTDOT;
973	if (*ndp->ni_next == 0)
974		cnp->cn_flags |= ISLASTCN;
975	else
976		cnp->cn_flags &= ~ISLASTCN;
977
978	if ((cnp->cn_flags & ISLASTCN) != 0 &&
979	    cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
980	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
981		error = EINVAL;
982		goto bad;
983	}
984
985	nameicap_tracker_add(ndp, dp);
986
987	/*
988	 * Check for degenerate name (e.g. / or "")
989	 * which is a way of talking about a directory,
990	 * e.g. like "/." or ".".
991	 */
992	if (cnp->cn_nameptr[0] == '\0') {
993		if (dp->v_type != VDIR) {
994			error = ENOTDIR;
995			goto bad;
996		}
997		if (cnp->cn_nameiop != LOOKUP) {
998			error = EISDIR;
999			goto bad;
1000		}
1001		if (wantparent) {
1002			ndp->ni_dvp = dp;
1003			VREF(dp);
1004		}
1005		ndp->ni_vp = dp;
1006
1007		if (cnp->cn_flags & AUDITVNODE1)
1008			AUDIT_ARG_VNODE1(dp);
1009		else if (cnp->cn_flags & AUDITVNODE2)
1010			AUDIT_ARG_VNODE2(dp);
1011
1012		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
1013			VOP_UNLOCK(dp);
1014		/* XXX This should probably move to the top of function. */
1015		if (cnp->cn_flags & SAVESTART)
1016			panic("lookup: SAVESTART");
1017		goto success;
1018	}
1019
1020	/*
	 * Handle "..": six special cases.
1022	 * 0. If doing a capability lookup and lookup_cap_dotdot is
1023	 *    disabled, return ENOTCAPABLE.
1024	 * 1. Return an error if this is the last component of
1025	 *    the name and the operation is DELETE or RENAME.
1026	 * 2. If at root directory (e.g. after chroot)
1027	 *    or at absolute root directory
1028	 *    then ignore it so can't get out.
1029	 * 3. If this vnode is the root of a mounted
1030	 *    filesystem, then replace it with the
1031	 *    vnode which was mounted on so we take the
1032	 *    .. in the other filesystem.
1033	 * 4. If the vnode is the top directory of
1034	 *    the jail or chroot, don't let them out.
1035	 * 5. If doing a capability lookup and lookup_cap_dotdot is
1036	 *    enabled, return ENOTCAPABLE if the lookup would escape
1037	 *    from the initial file descriptor directory.  Checks are
1038	 *    done by ensuring that namei() already traversed the
1039	 *    result of dotdot lookup.
1040	 */
1041	if (cnp->cn_flags & ISDOTDOT) {
1042		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
1043		    == NI_LCF_STRICTRELATIVE) {
1044#ifdef KTRACE
1045			if (KTRPOINT(curthread, KTR_CAPFAIL))
1046				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1047#endif
1048			error = ENOTCAPABLE;
1049			goto bad;
1050		}
1051		if ((cnp->cn_flags & ISLASTCN) != 0 &&
1052		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1053			error = EINVAL;
1054			goto bad;
1055		}
1056		for (;;) {
1057			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
1058			     pr = pr->pr_parent)
1059				if (dp == pr->pr_root)
1060					break;
1061			if (dp == ndp->ni_rootdir ||
1062			    dp == ndp->ni_topdir ||
1063			    dp == rootvnode ||
1064			    pr != NULL ||
1065			    ((dp->v_vflag & VV_ROOT) != 0 &&
1066			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
1067				ndp->ni_dvp = dp;
1068				ndp->ni_vp = dp;
1069				VREF(dp);
1070				goto nextname;
1071			}
1072			if ((dp->v_vflag & VV_ROOT) == 0)
1073				break;
1074			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
1075				error = ENOENT;
1076				goto bad;
1077			}
1078			tdp = dp;
1079			dp = dp->v_mount->mnt_vnodecovered;
1080			VREF(dp);
1081			vput(tdp);
1082			vn_lock(dp,
1083			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
1084			    LK_RETRY, ISDOTDOT));
1085			error = nameicap_check_dotdot(ndp, dp);
1086			if (error != 0) {
1087#ifdef KTRACE
1088				if (KTRPOINT(curthread, KTR_CAPFAIL))
1089					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1090#endif
1091				goto bad;
1092			}
1093		}
1094	}
1095
1096	/*
1097	 * We now have a segment name to search for, and a directory to search.
1098	 */
1099unionlookup:
1100#ifdef MAC
1101	error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, cnp);
1102	if (error)
1103		goto bad;
1104#endif
1105	ndp->ni_dvp = dp;
1106	ndp->ni_vp = NULL;
1107	ASSERT_VOP_LOCKED(dp, "lookup");
1108	/*
1109	 * If we have a shared lock we may need to upgrade the lock for the
1110	 * last operation.
1111	 */
1112	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
1113	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
1114		vn_lock(dp, LK_UPGRADE|LK_RETRY);
1115	if (VN_IS_DOOMED(dp)) {
1116		error = ENOENT;
1117		goto bad;
1118	}
1119	/*
1120	 * If we're looking up the last component and we need an exclusive
1121	 * lock, adjust our lkflags.
1122	 */
1123	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
1124		cnp->cn_lkflags = LK_EXCLUSIVE;
1125#ifdef NAMEI_DIAGNOSTIC
1126	vn_printf(dp, "lookup in ");
1127#endif
1128	lkflags_save = cnp->cn_lkflags;
1129	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
1130	    cnp->cn_flags);
1131	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
1132	cnp->cn_lkflags = lkflags_save;
1133	if (error != 0) {
1134		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
1135#ifdef NAMEI_DIAGNOSTIC
1136		printf("not found\n");
1137#endif
1138		if ((error == ENOENT) &&
1139		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
1140		    (dp->v_mount->mnt_flag & MNT_UNION)) {
1141			tdp = dp;
1142			dp = dp->v_mount->mnt_vnodecovered;
1143			VREF(dp);
1144			vput(tdp);
1145			vn_lock(dp,
1146			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
1147			    LK_RETRY, cnp->cn_flags));
1148			nameicap_tracker_add(ndp, dp);
1149			goto unionlookup;
1150		}
1151
1152		if (error == ERELOOKUP) {
1153			vref(dp);
1154			ndp->ni_vp = dp;
1155			error = 0;
1156			relookup = 1;
1157			goto good;
1158		}
1159
1160		if (error != EJUSTRETURN)
1161			goto bad;
1162		/*
1163		 * At this point, we know we're at the end of the
1164		 * pathname.  If creating / renaming, we can consider
1165		 * allowing the file or directory to be created / renamed,
1166		 * provided we're not on a read-only filesystem.
1167		 */
1168		if (rdonly) {
1169			error = EROFS;
1170			goto bad;
1171		}
1172		/* trailing slash only allowed for directories */
1173		if ((cnp->cn_flags & TRAILINGSLASH) &&
1174		    !(cnp->cn_flags & WILLBEDIR)) {
1175			error = ENOENT;
1176			goto bad;
1177		}
1178		if ((cnp->cn_flags & LOCKPARENT) == 0)
1179			VOP_UNLOCK(dp);
1180		/*
1181		 * We return with ni_vp NULL to indicate that the entry
1182		 * doesn't currently exist, leaving a pointer to the
1183		 * (possibly locked) directory vnode in ndp->ni_dvp.
1184		 */
1185		if (cnp->cn_flags & SAVESTART) {
1186			ndp->ni_startdir = ndp->ni_dvp;
1187			VREF(ndp->ni_startdir);
1188		}
1189		goto success;
1190	}
1191
1192good:
1193#ifdef NAMEI_DIAGNOSTIC
1194	printf("found\n");
1195#endif
1196	dp = ndp->ni_vp;
1197
1198	/*
1199	 * Check to see if the vnode has been mounted on;
1200	 * if so find the root of the mounted filesystem.
1201	 */
1202	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
1203	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1204		if (vfs_busy(mp, 0))
1205			continue;
1206		vput(dp);
1207		if (dp != ndp->ni_dvp)
1208			vput(ndp->ni_dvp);
1209		else
1210			vrele(ndp->ni_dvp);
1211		vrefact(vp_crossmp);
1212		ndp->ni_dvp = vp_crossmp;
1213		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
1214		    cnp->cn_flags), &tdp);
1215		vfs_unbusy(mp);
1216		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
1217			panic("vp_crossmp exclusively locked or reclaimed");
1218		if (error) {
1219			dpunlocked = 1;
1220			goto bad2;
1221		}
1222		ndp->ni_vp = dp = tdp;
1223	}
1224
1225	/*
1226	 * Check for symbolic link
1227	 */
1228	if ((dp->v_type == VLNK) &&
1229	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
1230	     *ndp->ni_next == '/')) {
1231		cnp->cn_flags |= ISSYMLINK;
1232		if (VN_IS_DOOMED(dp)) {
1233			/*
1234			 * We can't know whether the directory was mounted with
1235			 * NOSYMFOLLOW, so we can't follow safely.
1236			 */
1237			error = ENOENT;
1238			goto bad2;
1239		}
1240		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
1241			error = EACCES;
1242			goto bad2;
1243		}
1244		/*
1245		 * Symlink code always expects an unlocked dvp.
1246		 */
1247		if (ndp->ni_dvp != ndp->ni_vp) {
1248			VOP_UNLOCK(ndp->ni_dvp);
1249			ni_dvp_unlocked = 1;
1250		}
1251		goto success;
1252	}
1253
1254nextname:
1255	/*
1256	 * Not a symbolic link that we will follow.  Continue with the
1257	 * next component if there is any; otherwise, we're done.
1258	 */
1259	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
1260	    ("lookup: invalid path state."));
1261	if (relookup) {
1262		relookup = 0;
1263		ndp->ni_pathlen = prev_ni_pathlen;
1264		ndp->ni_next = prev_ni_next;
1265		if (ndp->ni_dvp != dp)
1266			vput(ndp->ni_dvp);
1267		else
1268			vrele(ndp->ni_dvp);
1269		goto dirloop;
1270	}
1271	if (cnp->cn_flags & ISDOTDOT) {
1272		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
1273		if (error != 0) {
1274#ifdef KTRACE
1275			if (KTRPOINT(curthread, KTR_CAPFAIL))
1276				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1277#endif
1278			goto bad2;
1279		}
1280	}
1281	if (*ndp->ni_next == '/') {
1282		cnp->cn_nameptr = ndp->ni_next;
1283		while (*cnp->cn_nameptr == '/') {
1284			cnp->cn_nameptr++;
1285			ndp->ni_pathlen--;
1286		}
1287		if (ndp->ni_dvp != dp)
1288			vput(ndp->ni_dvp);
1289		else
1290			vrele(ndp->ni_dvp);
1291		goto dirloop;
1292	}
1293	/*
1294	 * If we're processing a path with a trailing slash,
1295	 * check that the end result is a directory.
1296	 */
1297	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
1298		error = ENOTDIR;
1299		goto bad2;
1300	}
1301	/*
1302	 * Disallow directory write attempts on read-only filesystems.
1303	 */
1304	if (rdonly &&
1305	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1306		error = EROFS;
1307		goto bad2;
1308	}
1309	if (cnp->cn_flags & SAVESTART) {
1310		ndp->ni_startdir = ndp->ni_dvp;
1311		VREF(ndp->ni_startdir);
1312	}
1313	if (!wantparent) {
1314		ni_dvp_unlocked = 2;
1315		if (ndp->ni_dvp != dp)
1316			vput(ndp->ni_dvp);
1317		else
1318			vrele(ndp->ni_dvp);
1319	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
1320		VOP_UNLOCK(ndp->ni_dvp);
1321		ni_dvp_unlocked = 1;
1322	}
1323
1324	if (cnp->cn_flags & AUDITVNODE1)
1325		AUDIT_ARG_VNODE1(dp);
1326	else if (cnp->cn_flags & AUDITVNODE2)
1327		AUDIT_ARG_VNODE2(dp);
1328
1329	if ((cnp->cn_flags & LOCKLEAF) == 0)
1330		VOP_UNLOCK(dp);
1331success:
1332	/*
1333	 * FIXME: for lookups which only cross a mount point to fetch the
1334	 * root vnode, ni_dvp will be set to vp_crossmp. This can be a problem
1335	 * if either WANTPARENT or LOCKPARENT is set.
1336	 */
1337	/*
1338	 * Because of shared lookup we may have the vnode shared locked, but
1339	 * the caller may want it to be exclusively locked.
1340	 */
1341	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
1342	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
1343		vn_lock(dp, LK_UPGRADE | LK_RETRY);
1344		if (VN_IS_DOOMED(dp)) {
1345			error = ENOENT;
1346			goto bad2;
1347		}
1348	}
1349	if (ndp->ni_vp != NULL) {
1350		if ((cnp->cn_flags & ISDOTDOT) == 0)
1351			nameicap_tracker_add(ndp, ndp->ni_vp);
1352		if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
1353			goto bad_eexist;
1354	}
1355	return (0);
1356
1357bad2:
1358	if (ni_dvp_unlocked != 2) {
1359		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
1360			vput(ndp->ni_dvp);
1361		else
1362			vrele(ndp->ni_dvp);
1363	}
1364bad:
1365	if (!dpunlocked)
1366		vput(dp);
1367	ndp->ni_vp = NULL;
1368	return (error);
1369bad_eexist:
1370	/*
1371	 * FAILIFEXISTS handling.
1372	 *
1373	 * XXX namei called with LOCKPARENT but not LOCKLEAF has the strange
1374	 * behaviour of leaving the vnode unlocked if the target is the same
1375	 * vnode as the parent.
1376	 */
1377	MPASS((cnp->cn_flags & ISSYMLINK) == 0);
1378	if (ndp->ni_vp == ndp->ni_dvp)
1379		vrele(ndp->ni_dvp);
1380	else
1381		vput(ndp->ni_dvp);
1382	vrele(ndp->ni_vp);
1383	ndp->ni_dvp = NULL;
1384	ndp->ni_vp = NULL;
1385	NDFREE(ndp, NDF_ONLY_PNBUF);
1386	return (EEXIST);
1387}
1388
1389/*
1390 * relookup - lookup a path name component
1391 *    Used by lookup to re-acquire things.
1392 */
1393int
1394relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
1395{
1396	struct vnode *dp = NULL;		/* the directory we are searching */
1397	int rdonly;			/* lookup read-only flag bit */
1398	int error = 0;
1399
1400	KASSERT(cnp->cn_flags & ISLASTCN,
1401	    ("relookup: Not given last component."));
1402	/*
1403	 * Setup: break out flag bits into variables.
1404	 */
1405	KASSERT((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) != 0,
1406	    ("relookup: parent not wanted"));
1407	rdonly = cnp->cn_flags & RDONLY;
1408	cnp->cn_flags &= ~ISSYMLINK;
1409	dp = dvp;
1410	cnp->cn_lkflags = LK_EXCLUSIVE;
1411	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
1412
1413	/*
1414	 * Search a new directory.
1415	 *
1416	 * The last component of the filename is left accessible via
1417	 * cnp->cn_nameptr for callers that need the name. Callers needing
1418	 * the name set the SAVENAME flag. When done, they assume
1419	 * responsibility for freeing the pathname buffer.
1420	 */
1421#ifdef NAMEI_DIAGNOSTIC
1422	printf("{%s}: ", cnp->cn_nameptr);
1423#endif
1424
1425	/*
1426	 * Check for "" which represents the root directory after slash
1427	 * removal.
1428	 */
1429	if (cnp->cn_nameptr[0] == '\0') {
1430		/*
1431		 * Support only LOOKUP for "/" because lookup()
1432		 * can't succeed for CREATE, DELETE and RENAME.
1433		 */
1434		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
1435		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
1436
1437		if (!(cnp->cn_flags & LOCKLEAF))
1438			VOP_UNLOCK(dp);
1439		*vpp = dp;
1440		/* XXX This should probably move to the top of function. */
1441		if (cnp->cn_flags & SAVESTART)
1442			panic("lookup: SAVESTART");
1443		return (0);
1444	}
1445
1446	if (cnp->cn_flags & ISDOTDOT)
1447		panic ("relookup: lookup on dot-dot");
1448
1449	/*
1450	 * We now have a segment name to search for, and a directory to search.
1451	 */
1452#ifdef NAMEI_DIAGNOSTIC
1453	vn_printf(dp, "search in ");
1454#endif
1455	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
1456		KASSERT(*vpp == NULL, ("leaf should be empty"));
1457		if (error != EJUSTRETURN)
1458			goto bad;
1459		/*
1460		 * If creating and at end of pathname, then can consider
1461		 * allowing file to be created.
1462		 */
1463		if (rdonly) {
1464			error = EROFS;
1465			goto bad;
1466		}
1467		/* ASSERT(dvp == ndp->ni_startdir) */
1468		if (cnp->cn_flags & SAVESTART)
1469			VREF(dvp);
1470		if ((cnp->cn_flags & LOCKPARENT) == 0)
1471			VOP_UNLOCK(dp);
1472		/*
1473		 * We return with ni_vp NULL to indicate that the entry
1474		 * doesn't currently exist, leaving a pointer to the
1475		 * (possibly locked) directory vnode in ndp->ni_dvp.
1476		 */
1477		return (0);
1478	}
1479
1480	dp = *vpp;
1481
1482	/*
1483	 * Disallow directory write attempts on read-only filesystems.
1484	 */
1485	if (rdonly &&
1486	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1487		if (dvp == dp)
1488			vrele(dvp);
1489		else
1490			vput(dvp);
1491		error = EROFS;
1492		goto bad;
1493	}
1494	/*
1495	 * Set the parent lock/ref state to the requested state.
1496	 */
1497	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp)
1498		VOP_UNLOCK(dvp);
1499	/*
1500	 * Check for symbolic link
1501	 */
1502	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
1503	    ("relookup: symlink found.\n"));
1504
1505	/* ASSERT(dvp == ndp->ni_startdir) */
1506	if (cnp->cn_flags & SAVESTART)
1507		VREF(dvp);
1508
1509	if ((cnp->cn_flags & LOCKLEAF) == 0)
1510		VOP_UNLOCK(dp);
1511	return (0);
1512bad:
1513	vput(dp);
1514	*vpp = NULL;
1515	return (error);
1516}
1517
1518/*
1519 * Free data allocated by namei(); see namei(9) for details.
1520 */
1521void
1522NDFREE_PNBUF(struct nameidata *ndp)
1523{
1524
1525	if ((ndp->ni_cnd.cn_flags & HASBUF) != 0) {
1526		MPASS((ndp->ni_cnd.cn_flags & (SAVENAME | SAVESTART)) != 0);
1527		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
1528		ndp->ni_cnd.cn_flags &= ~HASBUF;
1529	}
1530}
1531
1532/*
1533 * NDFREE_PNBUF replacement for callers that know there is no buffer.
1534 *
1535 * This is a hack. Preferably the VFS layer would not produce anything more
1536 * than it was asked to do. Unfortunately several non-LOOKUP cases can add the
1537 * HASBUF flag to the result. Even then an interface could be implemented where
1538 * the caller specifies what they expected to see in the result and what they
1539 * are going to take care of.
1540 *
1541 * In the meantime provide this kludge as a trivial replacement for NDFREE_PNBUF
1542 * calls scattered throughout the kernel where we know for a fact the flag must not
1543 * be seen.
1544 */
1545#ifdef INVARIANTS
1546void
1547NDFREE_NOTHING(struct nameidata *ndp)
1548{
1549	struct componentname *cnp;
1550
1551	cnp = &ndp->ni_cnd;
1552	KASSERT(cnp->cn_nameiop == LOOKUP, ("%s: got non-LOOKUP op %d\n",
1553	    __func__, cnp->cn_nameiop));
1554	KASSERT((cnp->cn_flags & (SAVENAME | HASBUF)) == 0,
1555	    ("%s: bad flags \%" PRIx64 "\n", __func__, cnp->cn_flags));
1556}
1557#endif
1558
1559void
1560(NDFREE)(struct nameidata *ndp, const u_int flags)
1561{
1562	int unlock_dvp;
1563	int unlock_vp;
1564
1565	unlock_dvp = 0;
1566	unlock_vp = 0;
1567
1568	if (!(flags & NDF_NO_FREE_PNBUF)) {
1569		NDFREE_PNBUF(ndp);
1570	}
1571	if (!(flags & NDF_NO_VP_UNLOCK) &&
1572	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
1573		unlock_vp = 1;
1574	if (!(flags & NDF_NO_DVP_UNLOCK) &&
1575	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
1576	    ndp->ni_dvp != ndp->ni_vp)
1577		unlock_dvp = 1;
1578	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
1579		if (unlock_vp) {
1580			vput(ndp->ni_vp);
1581			unlock_vp = 0;
1582		} else
1583			vrele(ndp->ni_vp);
1584		ndp->ni_vp = NULL;
1585	}
1586	if (unlock_vp)
1587		VOP_UNLOCK(ndp->ni_vp);
1588	if (!(flags & NDF_NO_DVP_RELE) &&
1589	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
1590		if (unlock_dvp) {
1591			vput(ndp->ni_dvp);
1592			unlock_dvp = 0;
1593		} else
1594			vrele(ndp->ni_dvp);
1595		ndp->ni_dvp = NULL;
1596	}
1597	if (unlock_dvp)
1598		VOP_UNLOCK(ndp->ni_dvp);
1599	if (!(flags & NDF_NO_STARTDIR_RELE) &&
1600	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
1601		vrele(ndp->ni_startdir);
1602		ndp->ni_startdir = NULL;
1603	}
1604}
1605
#ifdef INVARIANTS
/*
 * Validate the final state of ndp after the lookup.
 *
 * Historically filesystems were allowed to modify cn_flags. Most notably they
 * can add SAVENAME to the request, resulting in HASBUF and pushing subsequent
 * clean up to the consumer. In practice this seems to only concern != LOOKUP
 * operations.
 *
 * As a step towards a stricter API contract this routine compares the flags
 * the caller asked for (cn_origflags) with the flags present now (cn_flags),
 * restricted to the bits that determine what has to be cleaned up, and panics
 * if they disagree. Validation is a work in progress with the intent of
 * becoming stricter over time.
 */
#define	NDMODIFYINGFLAGS \
	(LOCKLEAF | LOCKPARENT | WANTPARENT | SAVENAME | SAVESTART | HASBUF)
void
NDVALIDATE(struct nameidata *ndp)
{
	struct componentname *cnp;
	u_int64_t expected, actual;

	cnp = &ndp->ni_cnd;
	expected = cnp->cn_origflags & NDMODIFYINGFLAGS;
	actual = cnp->cn_flags & NDMODIFYINGFLAGS;

	if (cnp->cn_nameiop == LOOKUP) {
		/*
		 * For plain lookup we require strict conformance -- nothing
		 * to clean up if it was not requested by the caller.  Asking
		 * for the name or the start directory implies a buffer.
		 */
		if ((expected & (SAVENAME | SAVESTART)) != 0)
			expected |= HASBUF;
	} else if (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
	    cnp->cn_nameiop == RENAME) {
		/*
		 * Some filesystems set SAVENAME to provoke HASBUF; accommodate
		 * that until it gets fixed by ignoring both bits.
		 */
		expected |= SAVENAME | HASBUF;
		actual |= SAVENAME | HASBUF;
	} else {
		/* Other operations are not validated. */
		return;
	}

	if (expected == actual)
		return;

	panic("%s: mismatched flags for op %d: added %" PRIx64 ", "
	    "removed %" PRIx64 " (%" PRIx64 " != %" PRIx64 "; stored %"
	    PRIx64 " != %" PRIx64 ")",
	    __func__, cnp->cn_nameiop, actual & ~expected, expected & ~actual,
	    expected, actual, cnp->cn_origflags, cnp->cn_flags);
}
#endif
1667
1668/*
1669 * Determine if there is a suitable alternate filename under the specified
1670 * prefix for the specified path.  If the create flag is set, then the
1671 * alternate prefix will be used so long as the parent directory exists.
1672 * This is used by the various compatibility ABIs so that Linux binaries prefer
1673 * files under /compat/linux for example.  The chosen path (whether under
1674 * the prefix or under /) is returned in a kernel malloc'd buffer pointed
1675 * to by pathbuf.  The caller is responsible for free'ing the buffer from
1676 * the M_TEMP bucket if one is returned.
1677 */
1678int
1679kern_alternate_path(struct thread *td, const char *prefix, const char *path,
1680    enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
1681{
1682	struct nameidata nd, ndroot;
1683	char *ptr, *buf, *cp;
1684	size_t len, sz;
1685	int error;
1686
1687	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1688	*pathbuf = buf;
1689
1690	/* Copy the prefix into the new pathname as a starting point. */
1691	len = strlcpy(buf, prefix, MAXPATHLEN);
1692	if (len >= MAXPATHLEN) {
1693		*pathbuf = NULL;
1694		free(buf, M_TEMP);
1695		return (EINVAL);
1696	}
1697	sz = MAXPATHLEN - len;
1698	ptr = buf + len;
1699
1700	/* Append the filename to the prefix. */
1701	if (pathseg == UIO_SYSSPACE)
1702		error = copystr(path, ptr, sz, &len);
1703	else
1704		error = copyinstr(path, ptr, sz, &len);
1705
1706	if (error) {
1707		*pathbuf = NULL;
1708		free(buf, M_TEMP);
1709		return (error);
1710	}
1711
1712	/* Only use a prefix with absolute pathnames. */
1713	if (*ptr != '/') {
1714		error = EINVAL;
1715		goto keeporig;
1716	}
1717
1718	if (dirfd != AT_FDCWD) {
1719		/*
1720		 * We want the original because the "prefix" is
1721		 * included in the already opened dirfd.
1722		 */
1723		bcopy(ptr, buf, len);
1724		return (0);
1725	}
1726
1727	/*
1728	 * We know that there is a / somewhere in this pathname.
1729	 * Search backwards for it, to find the file's parent dir
1730	 * to see if it exists in the alternate tree. If it does,
1731	 * and we want to create a file (cflag is set). We don't
1732	 * need to worry about the root comparison in this case.
1733	 */
1734
1735	if (create) {
1736		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1737		*cp = '\0';
1738
1739		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
1740		error = namei(&nd);
1741		*cp = '/';
1742		if (error != 0)
1743			goto keeporig;
1744	} else {
1745		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
1746
1747		error = namei(&nd);
1748		if (error != 0)
1749			goto keeporig;
1750
1751		/*
1752		 * We now compare the vnode of the prefix to the one
1753		 * vnode asked. If they resolve to be the same, then we
1754		 * ignore the match so that the real root gets used.
1755		 * This avoids the problem of traversing "../.." to find the
1756		 * root directory and never finding it, because "/" resolves
1757		 * to the emulation root directory. This is expensive :-(
1758		 */
1759		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
1760		    td);
1761
1762		/* We shouldn't ever get an error from this namei(). */
1763		error = namei(&ndroot);
1764		if (error == 0) {
1765			if (nd.ni_vp == ndroot.ni_vp)
1766				error = ENOENT;
1767
1768			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1769			vrele(ndroot.ni_vp);
1770		}
1771	}
1772
1773	NDFREE(&nd, NDF_ONLY_PNBUF);
1774	vrele(nd.ni_vp);
1775
1776keeporig:
1777	/* If there was an error, use the original path name. */
1778	if (error)
1779		bcopy(ptr, buf, len);
1780	return (error);
1781}
1782