vfs_mount.c revision 153034
1/*-
2 * Copyright (c) 1999-2004 Poul-Henning Kamp
3 * Copyright (c) 1999 Michael Smith
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_mount.c 153034 2005-12-03 01:26:27Z rodrigc $");
39
40#include <sys/param.h>
41#include <sys/conf.h>
42#include <sys/jail.h>
43#include <sys/kernel.h>
44#include <sys/libkern.h>
45#include <sys/mac.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/proc.h>
51#include <sys/filedesc.h>
52#include <sys/reboot.h>
53#include <sys/syscallsubr.h>
54#include <sys/sysproto.h>
55#include <sys/sx.h>
56#include <sys/sysctl.h>
57#include <sys/sysent.h>
58#include <sys/systm.h>
59#include <sys/vnode.h>
60
61#include <geom/geom.h>
62
63#include <machine/stdarg.h>
64
65#include "opt_rootdevname.h"
66#include "opt_ddb.h"
67#include "opt_mac.h"
68
69#ifdef DDB
70#include <ddb/ddb.h>
71#endif
72
/* The literal "root_device"; its consumers lie outside this part of the file. */
#define	ROOTNAME		"root_device"
/* Cap, in bytes, on the memory a single set of mount options may consume. */
#define	VFS_MOUNTARG_SIZE_MAX	(64 * 1024)
75
/* Forward declarations for the file-local helpers defined further down. */
static void	free_mntarg(struct mntarg *ma);
static int	vfs_domount(struct thread *td, const char *fstype,
		    char *fspath, int fsflags, void *fsdata);
static int	vfs_donmount(struct thread *td, int fsflags,
		    struct uio *fsoptions);
static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
static int	vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
		    const char *fspath, struct thread *td, struct mount **mpp);
static void	vfs_mount_destroy(struct mount *mp, struct thread *td);
static int	vfs_mountroot_ask(void);
static int	vfs_mountroot_try(const char *mountfrom);
87
88static int	usermount = 0;
89SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
90    "Unprivileged users may mount and unmount file systems");
91
92MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
93
94/* List of mounted filesystems. */
95struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
96
97/* For any iteration/modification of mountlist */
98struct mtx mountlist_mtx;
99MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
100
101TAILQ_HEAD(vfsoptlist, vfsopt);
102struct vfsopt {
103	TAILQ_ENTRY(vfsopt) link;
104	char	*name;
105	void	*value;
106	int	len;
107};
108
/*
 * Vnode of the system root directory, i.e. "/" as seen when no chroot
 * is in effect.
 */
struct vnode	*rootvnode;
114
115/*
116 * The root filesystem is detailed in the kernel environment variable
117 * vfs.root.mountfrom, which is expected to be in the general format
118 *
119 * <vfsname>:[<path>]
120 * vfsname   := the name of a VFS known to the kernel and capable
121 *              of being mounted as root
122 * path      := disk device name or other data used by the filesystem
123 *              to locate its physical store
124 */
125
/*
 * Option names understood by every filesystem.  The list is terminated
 * by a NULL entry.
 */
static const char *global_opts[] = {
	"fstype", "fspath",
	"ro", "rw",
	"suid", "exec",
	NULL
};
138
/*
 * Root specifiers ("<vfsname>:<device>") to try, in order, when RB_CDROM
 * is specified.  NULL-terminated.
 */
static char *cdrom_rootdevnames[] = {
	"cd9660:cd0", "cd9660:acd0",
	NULL
};
147
/* Legacy find-root code: two externally visible slots, both empty at start. */
char		*rootdevnames[2] = { NULL, NULL };

/* Root device name fixed at compile time; NULL unless ROOTDEVNAME is set. */
#ifndef ROOTDEVNAME
#define	ROOTDEVNAME	NULL
#endif
static const char	*ctrootdevname = ROOTDEVNAME;
154
155/*
156 * ---------------------------------------------------------------------
157 * Functions for building and sanitizing the mount options
158 */
159
160/* Remove one mount option. */
161static void
162vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
163{
164
165	TAILQ_REMOVE(opts, opt, link);
166	free(opt->name, M_MOUNT);
167	if (opt->value != NULL)
168		free(opt->value, M_MOUNT);
169#ifdef INVARIANTS
170	else if (opt->len != 0)
171		panic("%s: mount option with NULL value but length != 0",
172		    __func__);
173#endif
174	free(opt, M_MOUNT);
175}
176
177/* Release all resources related to the mount options. */
178static void
179vfs_freeopts(struct vfsoptlist *opts)
180{
181	struct vfsopt *opt;
182
183	while (!TAILQ_EMPTY(opts)) {
184		opt = TAILQ_FIRST(opts);
185		vfs_freeopt(opts, opt);
186	}
187	free(opts, M_MOUNT);
188}
189
/*
 * Decide whether two option names refer to the same option, treating a
 * leading "no" on either one as insignificant.
 *
 * Returns 1 when the names match ("opt"/"opt", "noopt"/"noopt",
 * "noopt"/"opt" or "opt"/"noopt") and 0 otherwise.
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
	int negated1, negated2;

	negated1 = (strncmp(opt1, "no", 2) == 0);
	negated2 = (strncmp(opt2, "no", 2) == 0);

	/* Identical spelling. */
	if (strcmp(opt1, opt2) == 0)
		return (1);
	/* opt1 is the negated form of opt2. */
	if (negated1 && strcmp(opt1 + 2, opt2) == 0)
		return (1);
	/* opt2 is the negated form of opt1. */
	if (negated2 && strcmp(opt1, opt2 + 2) == 0)
		return (1);
	return (0);
}
208
209/*
210 * If a mount option is specified several times,
211 * (with or without the "no" prefix) only keep
212 * the last occurence of it.
213 */
214static void
215vfs_sanitizeopts(struct vfsoptlist *opts)
216{
217	struct vfsopt *opt, *opt2, *tmp;
218
219	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
220		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
221		while (opt2 != NULL) {
222			if (vfs_equalopts(opt->name, opt2->name)) {
223				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
224				vfs_freeopt(opts, opt2);
225				opt2 = tmp;
226			} else {
227				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
228			}
229		}
230	}
231}
232
233/*
234 * Build a linked list of mount options from a struct uio.
235 */
236static int
237vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
238{
239	struct vfsoptlist *opts;
240	struct vfsopt *opt;
241	size_t memused;
242	unsigned int i, iovcnt;
243	int error, namelen, optlen;
244
245	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
246	TAILQ_INIT(opts);
247	memused = 0;
248	iovcnt = auio->uio_iovcnt;
249	for (i = 0; i < iovcnt; i += 2) {
250		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
251		namelen = auio->uio_iov[i].iov_len;
252		optlen = auio->uio_iov[i + 1].iov_len;
253		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
254		opt->value = NULL;
255		opt->len = 0;
256
257		/*
258		 * Do this early, so jumps to "bad" will free the current
259		 * option.
260		 */
261		TAILQ_INSERT_TAIL(opts, opt, link);
262		memused += sizeof(struct vfsopt) + optlen + namelen;
263
264		/*
265		 * Avoid consuming too much memory, and attempts to overflow
266		 * memused.
267		 */
268		if (memused > VFS_MOUNTARG_SIZE_MAX ||
269		    optlen > VFS_MOUNTARG_SIZE_MAX ||
270		    namelen > VFS_MOUNTARG_SIZE_MAX) {
271			error = EINVAL;
272			goto bad;
273		}
274
275		if (auio->uio_segflg == UIO_SYSSPACE) {
276			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
277		} else {
278			error = copyin(auio->uio_iov[i].iov_base, opt->name,
279			    namelen);
280			if (error)
281				goto bad;
282		}
283		/* Ensure names are null-terminated strings. */
284		if (opt->name[namelen - 1] != '\0') {
285			error = EINVAL;
286			goto bad;
287		}
288		if (optlen != 0) {
289			opt->len = optlen;
290			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
291			if (auio->uio_segflg == UIO_SYSSPACE) {
292				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
293				    optlen);
294			} else {
295				error = copyin(auio->uio_iov[i + 1].iov_base,
296				    opt->value, optlen);
297				if (error)
298					goto bad;
299			}
300		}
301	}
302	vfs_sanitizeopts(opts);
303	*options = opts;
304	return (0);
305bad:
306	vfs_freeopts(opts);
307	return (error);
308}
309
310/*
311 * Merge the old mount options with the new ones passed
312 * in the MNT_UPDATE case.
313 */
314static void
315vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
316{
317	struct vfsopt *opt, *opt2, *new;
318
319	TAILQ_FOREACH(opt, opts, link) {
320		/*
321		 * Check that this option hasn't been redefined
322		 * nor cancelled with a "no" mount option.
323		 */
324		opt2 = TAILQ_FIRST(toopts);
325		while (opt2 != NULL) {
326			if (strcmp(opt2->name, opt->name) == 0)
327				goto next;
328			if (strncmp(opt2->name, "no", 2) == 0 &&
329			    strcmp(opt2->name + 2, opt->name) == 0) {
330				vfs_freeopt(toopts, opt2);
331				goto next;
332			}
333			opt2 = TAILQ_NEXT(opt2, link);
334		}
335		/* We want this option, duplicate it. */
336		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
337		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
338		strcpy(new->name, opt->name);
339		if (opt->len != 0) {
340			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
341			bcopy(opt->value, new->value, opt->len);
342		} else {
343			new->value = NULL;
344		}
345		new->len = opt->len;
346		TAILQ_INSERT_TAIL(toopts, new, link);
347next:
348		continue;
349	}
350}
351
352/*
353 * ---------------------------------------------------------------------
354 * Mount a filesystem
355 */
356int
357nmount(td, uap)
358	struct thread *td;
359	struct nmount_args /* {
360		struct iovec *iovp;
361		unsigned int iovcnt;
362		int flags;
363	} */ *uap;
364{
365	struct uio *auio;
366	struct iovec *iov;
367	unsigned int i;
368	int error;
369	u_int iovcnt;
370
371	/* Kick out MNT_ROOTFS early as it is legal internally */
372	if (uap->flags & MNT_ROOTFS)
373		return (EINVAL);
374
375	iovcnt = uap->iovcnt;
376	/*
377	 * Check that we have an even number of iovec's
378	 * and that we have at least two options.
379	 */
380	if ((iovcnt & 1) || (iovcnt < 4))
381		return (EINVAL);
382
383	error = copyinuio(uap->iovp, iovcnt, &auio);
384	if (error)
385		return (error);
386	iov = auio->uio_iov;
387	for (i = 0; i < iovcnt; i++) {
388		if (iov->iov_len > MMAXOPTIONLEN) {
389			free(auio, M_IOV);
390			return (EINVAL);
391		}
392		iov++;
393	}
394	error = vfs_donmount(td, uap->flags, auio);
395
396	free(auio, M_IOV);
397	return (error);
398}
399
400/*
401 * ---------------------------------------------------------------------
402 * Various utility functions
403 */
404
405/*
406 * Allocate and initialize the mount point struct.
407 */
408static int
409vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
410    const char *fspath, struct thread *td, struct mount **mpp)
411{
412	struct mount *mp;
413
414	mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
415	TAILQ_INIT(&mp->mnt_nvnodelist);
416	mp->mnt_nvnodelistsize = 0;
417	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
418	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
419	vfs_busy(mp, LK_NOWAIT, 0, td);
420	mp->mnt_op = vfsp->vfc_vfsops;
421	mp->mnt_vfc = vfsp;
422	vfsp->vfc_refcount++;
423	mp->mnt_stat.f_type = vfsp->vfc_typenum;
424	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
425	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
426	mp->mnt_vnodecovered = vp;
427	mp->mnt_cred = crdup(td->td_ucred);
428	mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
429	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
430	mp->mnt_iosize_max = DFLTPHYS;
431#ifdef MAC
432	mac_init_mount(mp);
433	mac_create_mount(td->td_ucred, mp);
434#endif
435	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
436	*mpp = mp;
437	return (0);
438}
439
440/*
441 * Destroy the mount struct previously allocated by vfs_mount_alloc().
442 */
443static void
444vfs_mount_destroy(struct mount *mp, struct thread *td)
445{
446
447	mp->mnt_vfc->vfc_refcount--;
448	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
449		panic("unmount: dangling vnode");
450	vfs_unbusy(mp,td);
451	lockdestroy(&mp->mnt_lock);
452	MNT_ILOCK(mp);
453	if (mp->mnt_kern_flag & MNTK_MWAIT)
454		wakeup(mp);
455	MNT_IUNLOCK(mp);
456	mtx_destroy(&mp->mnt_mtx);
457#ifdef MAC
458	mac_destroy_mount(mp);
459#endif
460	if (mp->mnt_opt != NULL)
461		vfs_freeopts(mp->mnt_opt);
462	crfree(mp->mnt_cred);
463	free(mp, M_MOUNT);
464}
465
466static int
467vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
468{
469	struct vfsoptlist *optlist;
470	char *fstype, *fspath, *errmsg;
471	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
472
473	errmsg_len = 0;
474	errmsg_pos = -1;
475
476	error = vfs_buildopts(fsoptions, &optlist);
477	if (error)
478		return (error);
479
480	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
481		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
482	else
483		errmsg_len = 0;
484
485	/*
486	 * We need these two options before the others,
487	 * and they are mandatory for any filesystem.
488	 * Ensure they are NUL terminated as well.
489	 */
490	fstypelen = 0;
491	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
492	if (error || fstype[fstypelen - 1] != '\0') {
493		error = EINVAL;
494		if (errmsg != NULL)
495			strncpy(errmsg, "Invalid fstype", errmsg_len);
496		goto bail;
497	}
498	fspathlen = 0;
499	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
500	if (error || fspath[fspathlen - 1] != '\0') {
501		error = EINVAL;
502		if (errmsg != NULL)
503			strncpy(errmsg, "Invalid fspath", errmsg_len);
504		goto bail;
505	}
506
507	/*
508	 * We need to see if we have the "update" option
509	 * before we call vfs_domount(), since vfs_domount() has special
510	 * logic based on MNT_UPDATE.  This is very important
511	 * when we want to update the root filesystem.
512	 */
513	if (vfs_getopt(optlist, "update", NULL, NULL) == 0)
514		fsflags |= MNT_UPDATE;
515
516	if (vfs_getopt(optlist, "async", NULL, NULL) == 0)
517		fsflags |= MNT_ASYNC;
518
519	if (vfs_getopt(optlist, "force", NULL, NULL) == 0)
520		fsflags |= MNT_FORCE;
521
522	if (vfs_getopt(optlist, "multilabel", NULL, NULL) == 0)
523		fsflags |= MNT_MULTILABEL;
524
525	if (vfs_getopt(optlist, "noasync", NULL, NULL) == 0)
526		fsflags &= ~MNT_ASYNC;
527
528	if (vfs_getopt(optlist, "noatime", NULL, NULL) == 0)
529		fsflags |= MNT_NOATIME;
530
531	if (vfs_getopt(optlist, "noclusterr", NULL, NULL) == 0)
532		fsflags |= MNT_NOCLUSTERR;
533
534	if (vfs_getopt(optlist, "noclusterw", NULL, NULL) == 0)
535		fsflags |= MNT_NOCLUSTERW;
536
537	if (vfs_getopt(optlist, "noexec", NULL, NULL) == 0)
538		fsflags |= MNT_NOEXEC;
539
540	if (vfs_getopt(optlist, "nosuid", NULL, NULL) == 0)
541		fsflags |= MNT_NOSUID;
542
543	if (vfs_getopt(optlist, "nosymfollow", NULL, NULL) == 0)
544		fsflags |= MNT_NOSYMFOLLOW;
545
546	if (vfs_getopt(optlist, "noro", NULL, NULL) == 0)
547		fsflags &= ~MNT_RDONLY;
548
549	if (vfs_getopt(optlist, "ro", NULL, NULL) == 0)
550		fsflags |= MNT_RDONLY;
551
552	if (vfs_getopt(optlist, "rw", NULL, NULL) == 0)
553		fsflags &= ~MNT_RDONLY;
554
555	if (vfs_getopt(optlist, "snapshot", NULL, NULL) == 0)
556		fsflags |= MNT_SNAPSHOT;
557
558	if (vfs_getopt(optlist, "suiddir", NULL, NULL) == 0)
559		fsflags |= MNT_SUIDDIR;
560
561	if (vfs_getopt(optlist, "sync", NULL, NULL) == 0)
562		fsflags |= MNT_SYNCHRONOUS;
563
564	if (vfs_getopt(optlist, "union", NULL, NULL) == 0)
565		fsflags |= MNT_UNION;
566
567	/*
568	 * Be ultra-paranoid about making sure the type and fspath
569	 * variables will fit in our mp buffers, including the
570	 * terminating NUL.
571	 */
572	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
573		error = ENAMETOOLONG;
574		goto bail;
575	}
576
577	mtx_lock(&Giant);
578	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
579	mtx_unlock(&Giant);
580bail:
581	/* copyout the errmsg */
582	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
583	    && errmsg_len > 0 && errmsg != NULL) {
584		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
585			strncpy(fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
586			    errmsg,
587			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
588		} else {
589			copystr(errmsg,
590			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
591			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len,
592			    NULL);
593		}
594	}
595
596	if (error != 0)
597		vfs_freeopts(optlist);
598	return (error);
599}
600
601/*
602 * ---------------------------------------------------------------------
603 * Old mount API.
604 */
605#ifndef _SYS_SYSPROTO_H_
606struct mount_args {
607	char	*type;
608	char	*path;
609	int	flags;
610	caddr_t	data;
611};
612#endif
613/* ARGSUSED */
614int
615mount(td, uap)
616	struct thread *td;
617	struct mount_args /* {
618		char *type;
619		char *path;
620		int flags;
621		caddr_t data;
622	} */ *uap;
623{
624	char *fstype;
625	struct vfsconf *vfsp = NULL;
626	struct mntarg *ma = NULL;
627	int error;
628
629	/* Kick out MNT_ROOTFS early as it is legal internally */
630	uap->flags &= ~MNT_ROOTFS;
631
632	if (uap->data == NULL)
633		return (EINVAL);
634
635	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
636	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
637	if (!error) {
638		mtx_lock(&Giant);	/* XXX ? */
639		vfsp = vfs_byname_kld(fstype, td, &error);
640		mtx_unlock(&Giant);
641	}
642	free(fstype, M_TEMP);
643	if (error)
644		return (error);
645	if (vfsp == NULL)
646		return (ENOENT);
647	if (vfsp->vfc_vfsops->vfs_cmount == NULL)
648		return (EOPNOTSUPP);
649
650	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
651	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
652	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
653	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
654	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
655
656	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
657	return (error);
658}
659
660
661/*
662 * vfs_domount(): actually attempt a filesystem mount.
663 */
664static int
665vfs_domount(
666	struct thread *td,	/* Flags common to all filesystems. */
667	const char *fstype,	/* Filesystem type. */
668	char *fspath,		/* Mount path. */
669	int fsflags,		/* Flags common to all filesystems. */
670	void *fsdata		/* Options local to the filesystem. */
671	)
672{
673	struct vnode *vp;
674	struct mount *mp;
675	struct vfsconf *vfsp;
676	int error, flag = 0, kern_flag = 0;
677	struct vattr va;
678	struct nameidata nd;
679
680	mtx_assert(&Giant, MA_OWNED);
681
682	/*
683	 * Be ultra-paranoid about making sure the type and fspath
684	 * variables will fit in our mp buffers, including the
685	 * terminating NUL.
686	 */
687	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
688		return (ENAMETOOLONG);
689
690	if (jailed(td->td_ucred))
691		return (EPERM);
692	if (usermount == 0) {
693		if ((error = suser(td)) != 0)
694			return (error);
695	}
696
697	/*
698	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
699	 */
700	if (fsflags & (MNT_EXPORTED | MNT_SUIDDIR)) {
701		if ((error = suser(td)) != 0)
702			return (error);
703	}
704	/*
705	 * Silently enforce MNT_NOSUID and MNT_USER for
706	 * unprivileged users.
707	 */
708	if (suser(td) != 0)
709		fsflags |= MNT_NOSUID | MNT_USER;
710	/*
711	 * Get vnode to be covered
712	 */
713	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
714	if ((error = namei(&nd)) != 0)
715		return (error);
716	NDFREE(&nd, NDF_ONLY_PNBUF);
717	vp = nd.ni_vp;
718	if (fsflags & MNT_UPDATE) {
719		if ((vp->v_vflag & VV_ROOT) == 0) {
720			vput(vp);
721			return (EINVAL);
722		}
723		mp = vp->v_mount;
724		flag = mp->mnt_flag;
725		kern_flag = mp->mnt_kern_flag;
726		/*
727		 * We only allow the filesystem to be reloaded if it
728		 * is currently mounted read-only.
729		 */
730		if ((fsflags & MNT_RELOAD) &&
731		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
732			vput(vp);
733			return (EOPNOTSUPP);	/* Needs translation */
734		}
735		/*
736		 * Only privileged root, or (if MNT_USER is set) the user that
737		 * did the original mount is permitted to update it.
738		 */
739		error = vfs_suser(mp, td);
740		if (error) {
741			vput(vp);
742			return (error);
743		}
744		if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
745			vput(vp);
746			return (EBUSY);
747		}
748		VI_LOCK(vp);
749		if ((vp->v_iflag & VI_MOUNT) != 0 ||
750		    vp->v_mountedhere != NULL) {
751			VI_UNLOCK(vp);
752			vfs_unbusy(mp, td);
753			vput(vp);
754			return (EBUSY);
755		}
756		vp->v_iflag |= VI_MOUNT;
757		VI_UNLOCK(vp);
758		mp->mnt_flag |= fsflags &
759		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
760		VOP_UNLOCK(vp, 0, td);
761		mp->mnt_optnew = fsdata;
762		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
763	} else {
764		/*
765		 * If the user is not root, ensure that they own the directory
766		 * onto which we are attempting to mount.
767		 */
768		error = VOP_GETATTR(vp, &va, td->td_ucred, td);
769		if (error) {
770			vput(vp);
771			return (error);
772		}
773		if (va.va_uid != td->td_ucred->cr_uid) {
774			if ((error = suser(td)) != 0) {
775				vput(vp);
776				return (error);
777			}
778		}
779		error = vinvalbuf(vp, V_SAVE, td, 0, 0);
780		if (error != 0) {
781			vput(vp);
782			return (error);
783		}
784		if (vp->v_type != VDIR) {
785			vput(vp);
786			return (ENOTDIR);
787		}
788		vfsp = vfs_byname_kld(fstype, td, &error);
789		if (vfsp == NULL) {
790			vput(vp);
791			return (error);
792		}
793		VI_LOCK(vp);
794		if ((vp->v_iflag & VI_MOUNT) != 0 ||
795		    vp->v_mountedhere != NULL) {
796			VI_UNLOCK(vp);
797			vput(vp);
798			return (EBUSY);
799		}
800		vp->v_iflag |= VI_MOUNT;
801		VI_UNLOCK(vp);
802
803		/*
804		 * Allocate and initialize the filesystem.
805		 */
806		error = vfs_mount_alloc(vp, vfsp, fspath, td, &mp);
807		if (error) {
808			vput(vp);
809			return (error);
810		}
811		VOP_UNLOCK(vp, 0, td);
812
813		/* XXXMAC: pass to vfs_mount_alloc? */
814		mp->mnt_optnew = fsdata;
815	}
816
817	/*
818	 * Set the mount level flags.
819	 */
820	if (fsflags & MNT_RDONLY)
821		mp->mnt_flag |= MNT_RDONLY;
822	mp->mnt_flag &=~ MNT_UPDATEMASK;
823	mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
824	/*
825	 * Mount the filesystem.
826	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
827	 * get.  No freeing of cn_pnbuf.
828	 */
829        error = VFS_MOUNT(mp, td);
830	if (!error) {
831		if (mp->mnt_opt != NULL)
832			vfs_freeopts(mp->mnt_opt);
833		mp->mnt_opt = mp->mnt_optnew;
834		VFS_STATFS(mp, &mp->mnt_stat, td);
835	}
836	/*
837	 * Prevent external consumers of mount options from reading
838	 * mnt_optnew.
839	*/
840	mp->mnt_optnew = NULL;
841	if (mp->mnt_flag & MNT_UPDATE) {
842		mp->mnt_flag &=
843		    ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
844		if (error) {
845			mp->mnt_flag = flag;
846			mp->mnt_kern_flag = kern_flag;
847		}
848		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
849			if (mp->mnt_syncer == NULL)
850				error = vfs_allocate_syncvnode(mp);
851		} else {
852			if (mp->mnt_syncer != NULL)
853				vrele(mp->mnt_syncer);
854			mp->mnt_syncer = NULL;
855		}
856		vfs_unbusy(mp, td);
857		VI_LOCK(vp);
858		vp->v_iflag &= ~VI_MOUNT;
859		VI_UNLOCK(vp);
860		vrele(vp);
861		return (error);
862	}
863	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
864	/*
865	 * Put the new filesystem on the mount list after root.
866	 */
867	cache_purge(vp);
868	if (!error) {
869		struct vnode *newdp;
870
871		VI_LOCK(vp);
872		vp->v_iflag &= ~VI_MOUNT;
873		VI_UNLOCK(vp);
874		vp->v_mountedhere = mp;
875		mtx_lock(&mountlist_mtx);
876		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
877		mtx_unlock(&mountlist_mtx);
878		vfs_event_signal(NULL, VQ_MOUNT, 0);
879		if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td))
880			panic("mount: lost mount");
881		mountcheckdirs(vp, newdp);
882		vput(newdp);
883		VOP_UNLOCK(vp, 0, td);
884		if ((mp->mnt_flag & MNT_RDONLY) == 0)
885			error = vfs_allocate_syncvnode(mp);
886		vfs_unbusy(mp, td);
887		if (error)
888			vrele(vp);
889	} else {
890		VI_LOCK(vp);
891		vp->v_iflag &= ~VI_MOUNT;
892		VI_UNLOCK(vp);
893		vfs_mount_destroy(mp, td);
894		vput(vp);
895	}
896	return (error);
897}
898
899/*
900 * ---------------------------------------------------------------------
901 * Unmount a filesystem.
902 *
903 * Note: unmount takes a path to the vnode mounted on as argument,
904 * not special file (as before).
905 */
906#ifndef _SYS_SYSPROTO_H_
907struct unmount_args {
908	char	*path;
909	int	flags;
910};
911#endif
912/* ARGSUSED */
913int
914unmount(td, uap)
915	struct thread *td;
916	register struct unmount_args /* {
917		char *path;
918		int flags;
919	} */ *uap;
920{
921	struct mount *mp;
922	char *pathbuf;
923	int error, id0, id1;
924
925	if (jailed(td->td_ucred))
926		return (EPERM);
927	if (usermount == 0) {
928		if ((error = suser(td)) != 0)
929			return (error);
930	}
931
932	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
933	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
934	if (error) {
935		free(pathbuf, M_TEMP);
936		return (error);
937	}
938	if (uap->flags & MNT_BYFSID) {
939		/* Decode the filesystem ID. */
940		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
941			free(pathbuf, M_TEMP);
942			return (EINVAL);
943		}
944
945		mtx_lock(&mountlist_mtx);
946		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
947			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
948			    mp->mnt_stat.f_fsid.val[1] == id1)
949				break;
950		}
951		mtx_unlock(&mountlist_mtx);
952	} else {
953		mtx_lock(&mountlist_mtx);
954		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
955			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
956				break;
957		}
958		mtx_unlock(&mountlist_mtx);
959	}
960	free(pathbuf, M_TEMP);
961	if (mp == NULL) {
962		/*
963		 * Previously we returned ENOENT for a nonexistent path and
964		 * EINVAL for a non-mountpoint.  We cannot tell these apart
965		 * now, so in the !MNT_BYFSID case return the more likely
966		 * EINVAL for compatibility.
967		 */
968		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
969	}
970
971	/*
972	 * Only privileged root, or (if MNT_USER is set) the user that did the
973	 * original mount is permitted to unmount this filesystem.
974	 */
975	error = vfs_suser(mp, td);
976	if (error)
977		return (error);
978
979	/*
980	 * Don't allow unmounting the root filesystem.
981	 */
982	if (mp->mnt_flag & MNT_ROOTFS)
983		return (EINVAL);
984	mtx_lock(&Giant);
985	error = dounmount(mp, uap->flags, td);
986	mtx_unlock(&Giant);
987	return (error);
988}
989
990/*
991 * Do the actual filesystem unmount.
992 */
993int
994dounmount(mp, flags, td)
995	struct mount *mp;
996	int flags;
997	struct thread *td;
998{
999	struct vnode *coveredvp, *fsrootvp;
1000	int error;
1001	int async_flag;
1002
1003	mtx_assert(&Giant, MA_OWNED);
1004
1005	MNT_ILOCK(mp);
1006	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
1007		MNT_IUNLOCK(mp);
1008		return (EBUSY);
1009	}
1010	mp->mnt_kern_flag |= MNTK_UNMOUNT;
1011	/* Allow filesystems to detect that a forced unmount is in progress. */
1012	if (flags & MNT_FORCE)
1013		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
1014	error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
1015	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td);
1016	if (error) {
1017		MNT_ILOCK(mp);
1018		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
1019		if (mp->mnt_kern_flag & MNTK_MWAIT)
1020			wakeup(mp);
1021		MNT_IUNLOCK(mp);
1022		return (error);
1023	}
1024	vn_start_write(NULL, &mp, V_WAIT);
1025
1026	if (mp->mnt_flag & MNT_EXPUBLIC)
1027		vfs_setpublicfs(NULL, NULL, NULL);
1028
1029	vfs_msync(mp, MNT_WAIT);
1030	async_flag = mp->mnt_flag & MNT_ASYNC;
1031	mp->mnt_flag &= ~MNT_ASYNC;
1032	cache_purgevfs(mp);	/* remove cache entries for this file sys */
1033	if (mp->mnt_syncer != NULL)
1034		vrele(mp->mnt_syncer);
1035	/*
1036	 * For forced unmounts, move process cdir/rdir refs on the fs root
1037	 * vnode to the covered vnode.  For non-forced unmounts we want
1038	 * such references to cause an EBUSY error.
1039	 */
1040	if ((flags & MNT_FORCE) &&
1041	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
1042		if (mp->mnt_vnodecovered != NULL)
1043			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
1044		if (fsrootvp == rootvnode) {
1045			vrele(rootvnode);
1046			rootvnode = NULL;
1047		}
1048		vput(fsrootvp);
1049	}
1050	if (((mp->mnt_flag & MNT_RDONLY) ||
1051	     (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
1052	    (flags & MNT_FORCE)) {
1053		error = VFS_UNMOUNT(mp, flags, td);
1054	}
1055	vn_finished_write(mp);
1056	if (error) {
1057		/* Undo cdir/rdir and rootvnode changes made above. */
1058		if ((flags & MNT_FORCE) &&
1059		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
1060			if (mp->mnt_vnodecovered != NULL)
1061				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
1062			if (rootvnode == NULL) {
1063				rootvnode = fsrootvp;
1064				vref(rootvnode);
1065			}
1066			vput(fsrootvp);
1067		}
1068		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
1069			(void) vfs_allocate_syncvnode(mp);
1070		MNT_ILOCK(mp);
1071		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
1072		mp->mnt_flag |= async_flag;
1073		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
1074		if (mp->mnt_kern_flag & MNTK_MWAIT)
1075			wakeup(mp);
1076		MNT_IUNLOCK(mp);
1077		return (error);
1078	}
1079	mtx_lock(&mountlist_mtx);
1080	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1081	if ((coveredvp = mp->mnt_vnodecovered) != NULL)
1082		coveredvp->v_mountedhere = NULL;
1083	mtx_unlock(&mountlist_mtx);
1084	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
1085	vfs_mount_destroy(mp, td);
1086	if (coveredvp != NULL)
1087		vrele(coveredvp);
1088	return (0);
1089}
1090
1091/*
1092 * ---------------------------------------------------------------------
1093 * Mounting of root filesystem
1094 *
1095 */
1096
1097struct root_hold_token {
1098	const char 			*who;
1099	LIST_ENTRY(root_hold_token)	list;
1100};
1101
1102static LIST_HEAD(, root_hold_token)	root_holds =
1103    LIST_HEAD_INITIALIZER(&root_holds);
1104
1105struct root_hold_token *
1106root_mount_hold(const char *identifier)
1107{
1108	struct root_hold_token *h;
1109
1110	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
1111	h->who = identifier;
1112	mtx_lock(&mountlist_mtx);
1113	LIST_INSERT_HEAD(&root_holds, h, list);
1114	mtx_unlock(&mountlist_mtx);
1115	return (h);
1116}
1117
1118void
1119root_mount_rel(struct root_hold_token *h)
1120{
1121
1122	mtx_lock(&mountlist_mtx);
1123	LIST_REMOVE(h, list);
1124	wakeup(&root_holds);
1125	mtx_unlock(&mountlist_mtx);
1126	free(h, M_DEVBUF);
1127}
1128
1129static void
1130root_mount_wait(void)
1131{
1132	struct root_hold_token *h;
1133
1134	for (;;) {
1135		DROP_GIANT();
1136		g_waitidle();
1137		PICKUP_GIANT();
1138		mtx_lock(&mountlist_mtx);
1139		if (LIST_EMPTY(&root_holds)) {
1140			mtx_unlock(&mountlist_mtx);
1141			break;
1142		}
1143		printf("Root mount waiting for:");
1144		LIST_FOREACH(h, &root_holds, list)
1145			printf(" %s", h->who);
1146		printf("\n");
1147		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
1148		    hz);
1149	}
1150}
1151
1152static void
1153set_rootvnode(struct thread *td)
1154{
1155	struct proc *p;
1156
1157	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
1158		panic("Cannot find root vnode");
1159
1160	p = td->td_proc;
1161	FILEDESC_LOCK(p->p_fd);
1162
1163	if (p->p_fd->fd_cdir != NULL)
1164		vrele(p->p_fd->fd_cdir);
1165	p->p_fd->fd_cdir = rootvnode;
1166	VREF(rootvnode);
1167
1168	if (p->p_fd->fd_rdir != NULL)
1169		vrele(p->p_fd->fd_rdir);
1170	p->p_fd->fd_rdir = rootvnode;
1171	VREF(rootvnode);
1172
1173	FILEDESC_UNLOCK(p->p_fd);
1174
1175	VOP_UNLOCK(rootvnode, 0, td);
1176}
1177
1178/*
1179 * Mount /devfs as our root filesystem, but do not put it on the mountlist
1180 * yet.  Create a /dev -> / symlink so that absolute pathnames will lookup.
1181 */
1182
1183static void
1184devfs_first(void)
1185{
1186	struct thread *td = curthread;
1187	struct vfsconf *vfsp;
1188	struct mount *mp = NULL;
1189	int error;
1190
1191	vfsp = vfs_byname("devfs");
1192	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
1193	if (vfsp == NULL)
1194		return;
1195
1196	error = vfs_mount_alloc(NULLVP, vfsp, "/dev", td, &mp);
1197	KASSERT(error == 0, ("vfs_mount_alloc failed %d", error));
1198	if (error)
1199		return;
1200
1201	error = VFS_MOUNT(mp, curthread);
1202	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
1203	if (error)
1204		return;
1205
1206	mtx_lock(&mountlist_mtx);
1207	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
1208	mtx_unlock(&mountlist_mtx);
1209
1210	set_rootvnode(td);
1211
1212	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
1213	if (error)
1214		printf("kern_symlink /dev -> / returns %d\n", error);
1215}
1216
1217/*
1218 * Surgically move our devfs to be mounted on /dev.
1219 */
1220
1221static void
1222devfs_fixup(struct thread *td)
1223{
1224	struct nameidata nd;
1225	int error;
1226	struct vnode *vp, *dvp;
1227	struct mount *mp;
1228
1229	/* Remove our devfs mount from the mountlist and purge the cache */
1230	mtx_lock(&mountlist_mtx);
1231	mp = TAILQ_FIRST(&mountlist);
1232	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1233	mtx_unlock(&mountlist_mtx);
1234	cache_purgevfs(mp);
1235
1236	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
1237	VI_LOCK(dvp);
1238	dvp->v_iflag &= ~VI_MOUNT;
1239	dvp->v_mountedhere = NULL;
1240	VI_UNLOCK(dvp);
1241
1242	/* Set up the real rootvnode, and purge the cache */
1243	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
1244	set_rootvnode(td);
1245	cache_purgevfs(rootvnode->v_mount);
1246
1247	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
1248	error = namei(&nd);
1249	if (error) {
1250		printf("Lookup of /dev for devfs, error: %d\n", error);
1251		return;
1252	}
1253	NDFREE(&nd, NDF_ONLY_PNBUF);
1254	vp = nd.ni_vp;
1255	if (vp->v_type != VDIR) {
1256		vput(vp);
1257	}
1258	error = vinvalbuf(vp, V_SAVE, td, 0, 0);
1259	if (error) {
1260		vput(vp);
1261	}
1262	cache_purge(vp);
1263	mp->mnt_vnodecovered = vp;
1264	vp->v_mountedhere = mp;
1265	mtx_lock(&mountlist_mtx);
1266	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1267	mtx_unlock(&mountlist_mtx);
1268	VOP_UNLOCK(vp, 0, td);
1269	vfs_unbusy(mp, td);
1270	vput(dvp);
1271
1272	/* Unlink the no longer needed /dev/dev -> / symlink */
1273	kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
1274}
1275
1276/*
1277 * Report errors during filesystem mounting.
1278 */
1279void
1280vfs_mount_error(struct mount *mp, const char *fmt, ...)
1281{
1282	struct vfsoptlist *moptlist = mp->mnt_optnew;
1283	va_list ap;
1284	int error, len;
1285	char *errmsg;
1286
1287	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
1288	if (error || errmsg == NULL || len <= 0)
1289		return;
1290
1291	va_start(ap, fmt);
1292	vsnprintf(errmsg, (size_t)len, fmt, ap);
1293	va_end(ap);
1294}
1295
1296/*
1297 * Find and mount the root filesystem
1298 */
1299void
1300vfs_mountroot(void)
1301{
1302	char *cp;
1303	int error, i, asked = 0;
1304
1305	root_mount_wait();
1306
1307	devfs_first();
1308
1309	/*
1310	 * We are booted with instructions to prompt for the root filesystem.
1311	 */
1312	if (boothowto & RB_ASKNAME) {
1313		if (!vfs_mountroot_ask())
1314			return;
1315		asked = 1;
1316	}
1317
1318	/*
1319	 * The root filesystem information is compiled in, and we are
1320	 * booted with instructions to use it.
1321	 */
1322	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
1323		if (!vfs_mountroot_try(ctrootdevname))
1324			return;
1325		ctrootdevname = NULL;
1326	}
1327
1328	/*
1329	 * We've been given the generic "use CDROM as root" flag.  This is
1330	 * necessary because one media may be used in many different
1331	 * devices, so we need to search for them.
1332	 */
1333	if (boothowto & RB_CDROM) {
1334		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
1335			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
1336				return;
1337		}
1338	}
1339
1340	/*
1341	 * Try to use the value read by the loader from /etc/fstab, or
1342	 * supplied via some other means.  This is the preferred
1343	 * mechanism.
1344	 */
1345	cp = getenv("vfs.root.mountfrom");
1346	if (cp != NULL) {
1347		error = vfs_mountroot_try(cp);
1348		freeenv(cp);
1349		if (!error)
1350			return;
1351	}
1352
1353	/*
1354	 * Try values that may have been computed by code during boot
1355	 */
1356	if (!vfs_mountroot_try(rootdevnames[0]))
1357		return;
1358	if (!vfs_mountroot_try(rootdevnames[1]))
1359		return;
1360
1361	/*
1362	 * If we (still) have a compiled-in default, try it.
1363	 */
1364	if (ctrootdevname != NULL)
1365		if (!vfs_mountroot_try(ctrootdevname))
1366			return;
1367	/*
1368	 * Everything so far has failed, prompt on the console if we haven't
1369	 * already tried that.
1370	 */
1371	if (!asked)
1372		if (!vfs_mountroot_ask())
1373			return;
1374
1375	panic("Root mount failed, startup aborted.");
1376}
1377
1378/*
1379 * Mount (mountfrom) as the root filesystem.
1380 */
1381static int
1382vfs_mountroot_try(const char *mountfrom)
1383{
1384	struct mount	*mp;
1385	char		*vfsname, *path;
1386	time_t		timebase;
1387	int		error;
1388	char		patt[32];
1389
1390	vfsname = NULL;
1391	path    = NULL;
1392	mp      = NULL;
1393	error   = EINVAL;
1394
1395	if (mountfrom == NULL)
1396		return (error);		/* don't complain */
1397	printf("Trying to mount root from %s\n", mountfrom);
1398
1399	/* parse vfs name and path */
1400	vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
1401	path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
1402	vfsname[0] = path[0] = 0;
1403	sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
1404	if (sscanf(mountfrom, patt, vfsname, path) < 1)
1405		goto out;
1406
1407	if (path[0] == '\0')
1408		strcpy(path, ROOTNAME);
1409
1410	error = kernel_vmount(
1411	    MNT_RDONLY | MNT_ROOTFS,
1412	    "fstype", vfsname,
1413	    "fspath", "/",
1414	    "from", path,
1415	    NULL);
1416	if (error == 0) {
1417		/*
1418		 * We mount devfs prior to mounting the / FS, so the first
1419		 * entry will typically be devfs.
1420		 */
1421		mp = TAILQ_FIRST(&mountlist);
1422		KASSERT(mp != NULL, ("%s: mountlist is empty", __func__));
1423
1424		/*
1425		 * Iterate over all currently mounted file systems and use
1426		 * the time stamp found to check and/or initialize the RTC.
1427		 * Typically devfs has no time stamp and the only other FS
1428		 * is the actual / FS.
1429		 * Call inittodr() only once and pass it the largest of the
1430		 * timestamps we encounter.
1431		 */
1432		timebase = 0;
1433		do {
1434			if (mp->mnt_time > timebase)
1435				timebase = mp->mnt_time;
1436			mp = TAILQ_NEXT(mp, mnt_list);
1437		} while (mp != NULL);
1438		inittodr(timebase);
1439
1440		devfs_fixup(curthread);
1441	}
1442out:
1443	free(path, M_MOUNT);
1444	free(vfsname, M_MOUNT);
1445	return (error);
1446}
1447
1448/*
1449 * ---------------------------------------------------------------------
1450 * Interactive root filesystem selection code.
1451 */
1452
/*
 * Prompt on the console for a root filesystem until one can be mounted.
 *
 * Each line read is dispatched on its first character: an empty line
 * aborts, "?" prints the GEOM disk devices and prompts again, anything
 * else is handed to vfs_mountroot_try().
 *
 * Returns 0 once a specification has been mounted and 1 when the input
 * was aborted with an empty line.
 */
static int
vfs_mountroot_ask(void)
{
	char spec[128];

	for (;;) {
		printf("\nManual root filesystem specification:\n");
		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
#if defined(__i386__) || defined(__ia64__)
		printf("                       eg. ufs:da0s1a\n");
#else
		printf("                       eg. ufs:/dev/da0a\n");
#endif
		printf("  ?                  List valid disk boot devices\n");
		printf("  <empty line>       Abort manual input\n");
		printf("\nmountroot> ");
		gets(spec, sizeof(spec), 1);

		switch (spec[0]) {
		case '\0':
			return (1);
		case '?':
			printf("\nList of GEOM managed disk devices:\n  ");
			g_dev_print();
			break;
		default:
			if (vfs_mountroot_try(spec) == 0)
				return (0);
			break;
		}
	}
}
1481
1482/*
1483 * ---------------------------------------------------------------------
1484 * Functions for querying mount options/arguments from filesystems.
1485 */
1486
1487/*
1488 * Check that no unknown options are given
1489 */
1490int
1491vfs_filteropt(struct vfsoptlist *opts, const char **legal)
1492{
1493	struct vfsopt *opt;
1494	const char **t, *p;
1495
1496
1497	TAILQ_FOREACH(opt, opts, link) {
1498		p = opt->name;
1499		if (p[0] == 'n' && p[1] == 'o')
1500			p += 2;
1501		for(t = global_opts; *t != NULL; t++)
1502			if (!strcmp(*t, p))
1503				break;
1504		if (*t != NULL)
1505			continue;
1506		for(t = legal; *t != NULL; t++)
1507			if (!strcmp(*t, p))
1508				break;
1509		if (*t != NULL)
1510			continue;
1511		printf("mount option <%s> is unknown\n", p);
1512		return (EINVAL);
1513	}
1514	return (0);
1515}
1516
1517/*
1518 * Get a mount option by its name.
1519 *
1520 * Return 0 if the option was found, ENOENT otherwise.
1521 * If len is non-NULL it will be filled with the length
1522 * of the option. If buf is non-NULL, it will be filled
1523 * with the address of the option.
1524 */
1525int
1526vfs_getopt(opts, name, buf, len)
1527	struct vfsoptlist *opts;
1528	const char *name;
1529	void **buf;
1530	int *len;
1531{
1532	struct vfsopt *opt;
1533
1534	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1535
1536	TAILQ_FOREACH(opt, opts, link) {
1537		if (strcmp(name, opt->name) == 0) {
1538			if (len != NULL)
1539				*len = opt->len;
1540			if (buf != NULL)
1541				*buf = opt->value;
1542			return (0);
1543		}
1544	}
1545	return (ENOENT);
1546}
1547
1548static int
1549vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
1550{
1551	struct vfsopt *opt;
1552	int i;
1553
1554	if (opts == NULL)
1555		return (-1);
1556
1557	i = 0;
1558	TAILQ_FOREACH(opt, opts, link) {
1559		if (strcmp(name, opt->name) == 0)
1560			return (i);
1561		++i;
1562	}
1563	return (-1);
1564}
1565
1566char *
1567vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
1568{
1569	struct vfsopt *opt;
1570
1571	*error = 0;
1572	TAILQ_FOREACH(opt, opts, link) {
1573		if (strcmp(name, opt->name) != 0)
1574			continue;
1575		if (((char *)opt->value)[opt->len - 1] != '\0') {
1576			*error = EINVAL;
1577			return (NULL);
1578		}
1579		return (opt->value);
1580	}
1581	return (NULL);
1582}
1583
1584int
1585vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
1586{
1587	struct vfsopt *opt;
1588
1589	TAILQ_FOREACH(opt, opts, link) {
1590		if (strcmp(name, opt->name) == 0) {
1591			if (w != NULL)
1592				*w |= val;
1593			return (1);
1594		}
1595	}
1596	if (w != NULL)
1597		*w &= ~val;
1598	return (0);
1599}
1600
1601int
1602vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
1603{
1604	va_list ap;
1605	struct vfsopt *opt;
1606	int ret;
1607
1608	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1609
1610	TAILQ_FOREACH(opt, opts, link) {
1611		if (strcmp(name, opt->name) != 0)
1612			continue;
1613		if (((char *)opt->value)[opt->len - 1] != '\0')
1614			return (0);
1615		va_start(ap, fmt);
1616		ret = vsscanf(opt->value, fmt, ap);
1617		va_end(ap);
1618		return (ret);
1619	}
1620	return (0);
1621}
1622
1623/*
1624 * Find and copy a mount option.
1625 *
1626 * The size of the buffer has to be specified
1627 * in len, if it is not the same length as the
1628 * mount option, EINVAL is returned.
1629 * Returns ENOENT if the option is not found.
1630 */
1631int
1632vfs_copyopt(opts, name, dest, len)
1633	struct vfsoptlist *opts;
1634	const char *name;
1635	void *dest;
1636	int len;
1637{
1638	struct vfsopt *opt;
1639
1640	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
1641
1642	TAILQ_FOREACH(opt, opts, link) {
1643		if (strcmp(name, opt->name) == 0) {
1644			if (len != opt->len)
1645				return (EINVAL);
1646			bcopy(opt->value, dest, opt->len);
1647			return (0);
1648		}
1649	}
1650	return (ENOENT);
1651}
1652
1653/*
1654 * This is a helper function for filesystems to traverse their
1655 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
1656 */
1657
1658struct vnode *
1659__mnt_vnode_next(struct vnode **nvp, struct mount *mp)
1660{
1661	struct vnode *vp;
1662
1663	mtx_assert(&mp->mnt_mtx, MA_OWNED);
1664
1665	vp = *nvp;
1666	/* Check if we are done */
1667	if (vp == NULL)
1668		return (NULL);
1669	/* If our next vnode is no longer ours, start over */
1670	if (vp->v_mount != mp)
1671		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
1672	/* Save pointer to next vnode in list */
1673	if (vp != NULL)
1674		*nvp = TAILQ_NEXT(vp, v_nmntvnodes);
1675	else
1676		*nvp = NULL;
1677	return (vp);
1678}
1679
1680int
1681__vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
1682{
1683	int error;
1684
1685	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td);
1686	if (sbp != &mp->mnt_stat)
1687		*sbp = mp->mnt_stat;
1688	return (error);
1689}
1690
1691void
1692vfs_mountedfrom(struct mount *mp, const char *from)
1693{
1694
1695	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
1696	strlcpy(mp->mnt_stat.f_mntfromname, from,
1697	    sizeof mp->mnt_stat.f_mntfromname);
1698}
1699
/*
 * ---------------------------------------------------------------------
 * This is the API for building mount arguments and mounting filesystems
 * from inside the kernel.
 *
 * The API works by accumulating individual arguments in a struct mntarg.
 * The first error is latched: once it is set, later additions are
 * ignored and kernel_mount() returns that error instead of mounting.
 *
 * XXX: should be documented in new manpage kernel_mount(9)
 */
1710
/*
 * A memory allocation which must be freed when we are done.  Only the
 * list linkage is declared here; users allocate extra room behind the
 * structure and keep their data there.
 */
struct mntaarg {
	SLIST_ENTRY(mntaarg) next;	/* linkage on the header's list */
};
1715
/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;		/* name and value iovecs, in pairs */
	int len;			/* number of iovecs used in v */
	int error;			/* first error seen; stops additions */
	SLIST_HEAD(, mntaarg) list;	/* allocations freed with the header */
};
1723
/*
 * Add a boolean argument.
 *
 * name is the negated spelling of the option and must start with "no".
 * A true flag adds the option with that prefix skipped; a false flag
 * adds the name as given.  The argument is added without a value.
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
	const char *optname;

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	optname = name;
	if (flag)
		optname += 2;		/* skip the "no" */
	return (mount_arg(ma, optname, NULL, 0));
}
1739
1740/*
1741 * Add an argument printf style
1742 */
1743struct mntarg *
1744mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
1745{
1746	va_list ap;
1747	struct mntaarg *maa;
1748	struct sbuf *sb;
1749	int len;
1750
1751	if (ma == NULL) {
1752		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1753		SLIST_INIT(&ma->list);
1754	}
1755	if (ma->error)
1756		return (ma);
1757
1758	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1759	    M_MOUNT, M_WAITOK);
1760	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1761	ma->v[ma->len].iov_len = strlen(name) + 1;
1762	ma->len++;
1763
1764	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
1765	va_start(ap, fmt);
1766	sbuf_vprintf(sb, fmt, ap);
1767	va_end(ap);
1768	sbuf_finish(sb);
1769	len = sbuf_len(sb) + 1;
1770	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1771	SLIST_INSERT_HEAD(&ma->list, maa, next);
1772	bcopy(sbuf_data(sb), maa + 1, len);
1773	sbuf_delete(sb);
1774
1775	ma->v[ma->len].iov_base = maa + 1;
1776	ma->v[ma->len].iov_len = len;
1777	ma->len++;
1778
1779	return (ma);
1780}
1781
1782/*
1783 * Add an argument which is a userland string.
1784 */
1785struct mntarg *
1786mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
1787{
1788	struct mntaarg *maa;
1789	char *tbuf;
1790
1791	if (val == NULL)
1792		return (ma);
1793	if (ma == NULL) {
1794		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1795		SLIST_INIT(&ma->list);
1796	}
1797	if (ma->error)
1798		return (ma);
1799	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1800	SLIST_INSERT_HEAD(&ma->list, maa, next);
1801	tbuf = (void *)(maa + 1);
1802	ma->error = copyinstr(val, tbuf, len, NULL);
1803	return (mount_arg(ma, name, tbuf, -1));
1804}
1805
1806/*
1807 * Plain argument.
1808 *
1809 * If length is -1, use printf.
1810 */
1811struct mntarg *
1812mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
1813{
1814
1815	if (ma == NULL) {
1816		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1817		SLIST_INIT(&ma->list);
1818	}
1819	if (ma->error)
1820		return (ma);
1821
1822	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1823	    M_MOUNT, M_WAITOK);
1824	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1825	ma->v[ma->len].iov_len = strlen(name) + 1;
1826	ma->len++;
1827
1828	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
1829	if (len < 0)
1830		ma->v[ma->len].iov_len = strlen(val) + 1;
1831	else
1832		ma->v[ma->len].iov_len = len;
1833	ma->len++;
1834	return (ma);
1835}
1836
1837/*
1838 * Free a mntarg structure
1839 */
1840static void
1841free_mntarg(struct mntarg *ma)
1842{
1843	struct mntaarg *maa;
1844
1845	while (!SLIST_EMPTY(&ma->list)) {
1846		maa = SLIST_FIRST(&ma->list);
1847		SLIST_REMOVE_HEAD(&ma->list, next);
1848		free(maa, M_MOUNT);
1849	}
1850	free(ma->v, M_MOUNT);
1851	free(ma, M_MOUNT);
1852}
1853
1854/*
1855 * Mount a filesystem
1856 */
1857int
1858kernel_mount(struct mntarg *ma, int flags)
1859{
1860	struct uio auio;
1861	int error;
1862
1863	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
1864	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
1865	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
1866
1867	auio.uio_iov = ma->v;
1868	auio.uio_iovcnt = ma->len;
1869	auio.uio_segflg = UIO_SYSSPACE;
1870
1871	error = ma->error;
1872	if (!error)
1873		error = vfs_donmount(curthread, flags, &auio);
1874	free_mntarg(ma);
1875	return (error);
1876}
1877
/*
 * A printflike function to mount a filesystem.
 *
 * After flags comes a list of name/value pointer pairs that is ended by
 * a NULL name.  Each pair is added with mount_arg() using a length of
 * -1, so every value has to be a NUL-terminated string.  The result of
 * kernel_mount() is returned.
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *ma;
	va_list ap;
	const char *optname;
	const void *optval;

	ma = NULL;

	va_start(ap, flags);
	while ((optname = va_arg(ap, const char *)) != NULL) {
		optval = va_arg(ap, const void *);
		ma = mount_arg(ma, optname, optval, -1);
	}
	va_end(ap);

	return (kernel_mount(ma, flags));
}
1903